## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean
#description_unique_b
## content of keywords tag; words are separated by space
## content of keywords tag; words are separated by comma, semicolon or space
Stringpropval=tag.opts.getProperty("content");// value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values
if(propval==null)propval=tag.opts.getProperty("datetime");// html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional
@ -654,41 +655,41 @@ public class ContentScraper extends AbstractScraper implements Scraper {
id(SolrType.string,true,true,false,false,false,"primary key of document, the URL hash **mandatory field**",true),
sku(SolrType.string,true,true,false,true,true,"url of document",true),// a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr.
@ -55,7 +55,7 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_urlstub_sxt(SolrType.string,true,true,true,false,true,"external links, the url only without the protocol",true),// needed to enhance the crawler
images_urlstub_sxt(SolrType.string,true,true,true,false,true,"all image links without the protocol and '://'",true),
images_protocol_sxt(SolrType.string,true,true,true,false,false,"all image link protocols",true),// for correct assembly of image url images_protocol_sxt + images_urlstub_sxt is needed
// optional but recommended, part of index distribution
fresh_date_dt(SolrType.date,true,true,false,false,false,"date until resource shall be considered as fresh"),
referrer_id_s(SolrType.string,true,true,false,false,false,"id of the referrer to this document, discovered during crawling"),// byte[] referrerHash();
@ -64,7 +64,7 @@ public enum CollectionSchema implements SchemaDeclaration {
audiolinkscount_i(SolrType.num_integer,true,true,false,false,false,"number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.num_integer,true,true,false,false,false,"number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.num_integer,true,true,false,false,false,"number of links to application resources"),// int lapp();
// optional but recommended
title_exact_signature_l(SolrType.num_long,true,true,false,false,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
title_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"),
@ -97,7 +97,7 @@ public enum CollectionSchema implements SchemaDeclaration {
description_txt(SolrType.text_general,true,true,true,false,true,"content of description-tag(s)"),
description_exact_signature_l(SolrType.num_long,true,true,false,false,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
description_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general,true,true,false,false,true,"content of keywords tag; words are separated by space"),
keywords(SolrType.text_general,true,true,false,false,true,"content of keywords tag; words are separated by comma, semicolon or space"),
icons_rel_sxt(SolrType.string,true,true,true,false,false,"all icon links relationships space separated (e.g.. 'icon apple-touch-icon')"),
icons_sizes_sxt(SolrType.string,true,true,true,false,false,"all icon sizes space separated (e.g. '16x16 32x32')"),
images_text_t(SolrType.text_general,true,true,false,false,true,"all text/words appearing in image alt texts or the tokenized url"),
images_alt_sxt(SolrType.string,true,true,true,false,true,"all image link alt tag"),// no need to index this; don't turn it into a txt field; use images_text_t instead
images_height_val(SolrType.num_integer,true,true,true,false,false,"size of images:height"),
@ -192,7 +192,7 @@ public enum CollectionSchema implements SchemaDeclaration {
navigation_url_sxt(SolrType.string,true,true,true,false,false,"page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html"),
navigation_type_sxt(SolrType.string,true,true,true,false,false,"page navigation rel property value, can contain one of {top,up,next,prev,first,last}"),
publisher_url_s(SolrType.string,true,true,false,false,false,"publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
url_protocol_s(SolrType.string,true,true,false,false,false,"the protocol of the url"),
url_file_name_s(SolrType.string,true,true,false,false,true,"the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"),
url_file_name_tokens_t(SolrType.text_general,true,true,false,false,true,"tokens generated from url_file_name_s which can be used for better matching and result boosting"),
@ -228,15 +228,15 @@ public enum CollectionSchema implements SchemaDeclaration {
opengraph_type_s(SolrType.text_general,true,true,false,false,false,"Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"),
opengraph_url_s(SolrType.text_general,true,true,false,false,false,"Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general,true,true,false,false,false,"Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
// link structure for ranking
cr_host_count_i(SolrType.num_integer,true,true,false,false,false,"the number of documents within a single host"),
cr_host_chance_d(SolrType.num_double,true,true,false,false,false,"the chance to click on this page when randomly clicking on links within on one host"),
cr_host_norm_i(SolrType.num_integer,true,true,false,false,false,"normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"),
// custom rating; values to influence the ranking in combination with boost rules
rating_i(SolrType.num_integer,true,true,false,false,false,"custom rating; to be set with external rating information"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.num_integer,true,true,true,false,false,"number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer,true,true,true,false,false,"number of occurrences of texts in italic_txt"),
@ -254,7 +254,7 @@ public enum CollectionSchema implements SchemaDeclaration {
ext_title_txt(SolrType.text_general,true,true,true,false,false,"names matching title expressions"),
ext_title_val(SolrType.num_integer,true,true,true,false,false,"number of matching title expressions"),
vocabularies_sxt(SolrType.string,true,true,true,false,false,"collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies");
publicfinalstaticStringCORE_NAME="collection1";// this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
publicfinalstaticStringVOCABULARY_PREFIX="vocabulary_";// collects all terms that appear for each vocabulary
@ -262,15 +262,15 @@ public enum CollectionSchema implements SchemaDeclaration {
publicfinalstaticStringVOCABULARY_COUNT_SUFFIX="_i";// suffix for the term counter (>=1) that start with VOCABULARY_PREFIX - middle part is vocabulary name
publicfinalstaticStringVOCABULARY_LOGCOUNT_SUFFIX="_log_i";// log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
publicfinalstaticStringVOCABULARY_LOGCOUNTS_SUFFIX="_log_val";// all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
privateStringsolrFieldName=null;// solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )