added schema.org breadcrumb counter to parser and solr schema

pull/1/head
Michael Peter Christen 12 years ago
parent a06930662c
commit c3e5f667a7

@ -371,6 +371,9 @@ host_organization_s
#h5_i
#h6_i
## breadcrumbs, see http://schema.org/WebPage; this is a counter of how many itemprop="breadcrumb" properties in div tags appear within a page
#schema_org_breadcrumb_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt

@ -164,7 +164,9 @@ public enum YaCySchema implements Schema {
h4_i(SolrType.integer, true, true, false, "number of h4 header lines"),
h5_i(SolrType.integer, true, true, false, "number of h5 header lines"),
h6_i(SolrType.integer, true, true, false, "number of h6 header lines"),
schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),

@ -138,6 +138,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private double lon, lat;
private MultiProtocolURI canonical;
private final int maxLinks;
private int breadcrumbs;
/**
@ -186,6 +187,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.lat = 0.0d;
this.evaluationScores.match(Element.url, root.toNormalform(false, false));
this.canonical = null;
this.breadcrumbs = 0;
}
@Override
@ -356,6 +358,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tagname.equalsIgnoreCase("div")) {
final String id = tagopts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", EMPTY_STRING);
final String content = tagopts.getProperty("content", EMPTY_STRING);
@ -652,6 +658,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false;
}
/**
 * Reports the breadcrumb counter collected by this scraper.
 * The counter is incremented once for every scraped div tag whose
 * itemtype attribute equals "http://data-vocabulary.org/Breadcrumb".
 *
 * @return the current value of the breadcrumb counter
 */
public int breadcrumbCount() {
    return breadcrumbs;
}
public String getText() {
try {
return this.content.toString();

@ -447,8 +447,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h4_txt, hs); add(doc, YaCySchema.h4_i, hs.length);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h5_txt, hs); add(doc, YaCySchema.h5_i, hs.length);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h6_txt, hs); add(doc, YaCySchema.h6_i, hs.length);
add(doc, YaCySchema.htags_i, h);
add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount());
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)

Loading…
Cancel
Save