From c3e5f667a75b4865da16efc5c1d2060c89074195 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 9 Oct 2012 13:02:43 +0200 Subject: [PATCH] added schema.org breadcrumb counter to parser and solr schema --- defaults/solr.keys.list | 3 +++ source/net/yacy/cora/federate/solr/YaCySchema.java | 4 +++- .../net/yacy/document/parser/html/ContentScraper.java | 10 ++++++++++ source/net/yacy/search/index/SolrConfiguration.java | 3 ++- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 81f0b322b..8cb4852ba 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -371,6 +371,9 @@ host_organization_s #h5_i #h6_i +## breadcrumbs, see http://schema.org/WebPage; this is a counter of how many itemprop="breadcrumb" properties in div tags appear within a page +#schema_org_breadcrumb_i + ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias #ext_cms_txt diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 7a749464b..8efde1cd3 100644 --- a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -164,7 +164,9 @@ public enum YaCySchema implements Schema { h4_i(SolrType.integer, true, true, false, "number of h4 header lines"), h5_i(SolrType.integer, true, true, false, "number of h5 header lines"), h6_i(SolrType.integer, true, true, false, "number of h6 header lines"), - + + schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"), + // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"), diff --git 
a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 2990de966..62a18b8cb 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -138,6 +138,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private double lon, lat; private MultiProtocolURI canonical; private final int maxLinks; + private int breadcrumbs; /** @@ -186,6 +187,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.lat = 0.0d; this.evaluationScores.match(Element.url, root.toNormalform(false, false)); this.canonical = null; + this.breadcrumbs = 0; } @Override @@ -356,6 +358,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if (tagname.equalsIgnoreCase("div")) { final String id = tagopts.getProperty("id", EMPTY_STRING); this.evaluationScores.match(Element.divid, id); + final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING); + if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { + breadcrumbs++; + } } else if (tagname.equalsIgnoreCase("meta")) { String name = tagopts.getProperty("name", EMPTY_STRING); final String content = tagopts.getProperty("content", EMPTY_STRING); @@ -652,6 +658,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { return false; } + public int breadcrumbCount() { + return this.breadcrumbs; + } + public String getText() { try { return this.content.toString(); diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 7af31ea3a..eb06f3838 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -447,8 +447,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable hs = html.getHeadlines(4); h = h | (hs.length > 0 ? 
f : 0); f = f * 2; add(doc, YaCySchema.h4_txt, hs); add(doc, YaCySchema.h4_i, hs.length); hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h5_txt, hs); add(doc, YaCySchema.h5_i, hs.length); hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h6_txt, hs); add(doc, YaCySchema.h6_i, hs.length); - + add(doc, YaCySchema.htags_i, h); + add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount()); // noindex and nofollow attributes // from HTML (meta-tag in HTML header: robots)