From 7e3e45fd04ea6f55968d1f7d949f22f43f4524e7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 9 Oct 2012 17:28:48 +0200 Subject: [PATCH] added Open Graph Metadata default fields, see http://ogp.me/ns# --- defaults/solr.keys.list | 6 ++++++ .../net/yacy/cora/federate/solr/YaCySchema.java | 8 ++++++-- .../yacy/document/parser/html/ContentScraper.java | 15 +++++++++------ .../net/yacy/search/index/SolrConfiguration.java | 7 +++++++ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 8cb4852ba..ba97e608f 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -374,6 +374,12 @@ host_organization_s ## breadcrumbs, see http://schema.org/WebPage; this is a counter how many itemprop="breadcrumb" properties in div tags appears within a page #schema_org_breadcrumb_i +## Open Graph Metadata field, see http://ogp.me/ns# +#opengraph_title_t +#opengraph_type_s +#opengraph_url_s +#opengraph_image_s + ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias #ext_cms_txt diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 8efde1cd3..822d3e6f9 100644 --- a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -164,8 +164,12 @@ public enum YaCySchema implements Schema { h4_i(SolrType.integer, true, true, false, "number of h4 header lines"), h5_i(SolrType.integer, true, true, false, "number of h5 header lines"), h6_i(SolrType.integer, true, true, false, "number of h6 header lines"), - - schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"), + + schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"), + opengraph_title_t(SolrType.text_general, true, true, false, "Open Graph Metadata from og:title metadata field, see http://ogp.me/ns#"), + opengraph_type_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"), + opengraph_url_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"), + opengraph_image_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"), // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 62a18b8cb..d96df2a1e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -363,18 +363,21 @@ public class ContentScraper extends AbstractScraper implements Scraper { breadcrumbs++; } } else if (tagname.equalsIgnoreCase("meta")) { - String name = tagopts.getProperty("name", EMPTY_STRING); final String content = tagopts.getProperty("content", EMPTY_STRING); + String name = tagopts.getProperty("name", EMPTY_STRING); if (name.length() > 0) { this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); if (name.toLowerCase().equals("generator")) { this.evaluationScores.match(Element.metagenerator, content); } - } else { - name = tagopts.getProperty("http-equiv", EMPTY_STRING); - if (name.length() > 0) { - this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); - } + } + name = tagopts.getProperty("http-equiv", EMPTY_STRING); + if (name.length() > 0) { + this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); + } + name = tagopts.getProperty("property", EMPTY_STRING); + if (name.length() > 0) { + this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); } } else if (tagname.equalsIgnoreCase("area")) { final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING)); diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index eb06f3838..130f20024 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -451,6 +451,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable add(doc, YaCySchema.htags_i, h); add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount()); + // meta tags: Open Graph properties + String og; + og = html.getMetas().get("og:title"); if (og != null) add(doc, YaCySchema.opengraph_title_t, og); + og = html.getMetas().get("og:type"); if (og != null) add(doc, YaCySchema.opengraph_type_s, og); + og = html.getMetas().get("og:url"); if (og != null) add(doc, YaCySchema.opengraph_url_s, og); + og = html.getMetas().get("og:image"); if (og != null) add(doc, YaCySchema.opengraph_image_s, og); + // noindex and nofollow attributes // from HTML (meta-tag in HTML header: robots) // and HTTP header (x-robots property)