added Open Graph Metadata default fields, see http://ogp.me/ns#

pull/1/head
Michael Peter Christen 12 years ago
parent c3e5f667a7
commit 7e3e45fd04

@ -374,6 +374,12 @@ host_organization_s
## breadcrumbs, see http://schema.org/WebPage; this counts how many itemprop="breadcrumb" properties in div tags appear within a page
#schema_org_breadcrumb_i
## Open Graph Metadata fields, see http://ogp.me/ns#
#opengraph_title_t
#opengraph_type_s
#opengraph_url_s
#opengraph_image_s
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt

@ -164,8 +164,12 @@ public enum YaCySchema implements Schema {
h4_i(SolrType.integer, true, true, false, "number of h4 header lines"),
h5_i(SolrType.integer, true, true, false, "number of h5 header lines"),
h6_i(SolrType.integer, true, true, false, "number of h6 header lines"),
schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),
schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),
opengraph_title_t(SolrType.text_general, true, true, false, "Open Graph Metadata from og:title metadata field, see http://ogp.me/ns#"),
opengraph_type_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"),
opengraph_url_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),

@ -363,18 +363,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
breadcrumbs++;
}
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", EMPTY_STRING);
final String content = tagopts.getProperty("content", EMPTY_STRING);
String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
} else {
name = tagopts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
}
name = tagopts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
name = tagopts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));

@ -451,6 +451,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
add(doc, YaCySchema.htags_i, h);
add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount());
// meta tags: Open Graph properties
String og;
og = html.getMetas().get("og:title"); if (og != null) add(doc, YaCySchema.opengraph_title_t, og);
og = html.getMetas().get("og:type"); if (og != null) add(doc, YaCySchema.opengraph_type_s, og);
og = html.getMetas().get("og:url"); if (og != null) add(doc, YaCySchema.opengraph_url_s, og);
og = html.getMetas().get("og:image"); if (og != null) add(doc, YaCySchema.opengraph_image_s, og);
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (x-robots property)

Loading…
Cancel
Save