diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index 6f336a8c3..757605930 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -9,30 +9,80 @@
## - all non-empty lines not beginning with '#' are keyword lines
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
-##url of document, string
-sku
+### mandatory values, do not disable them, YaCy won't work without them
-## primary key of document, the URL hash, string
+## primary key of document, the URL hash, string (mandatory field)
id
+## url of document, string (mandatory field)
+sku
+
+## last-modified from http header, date (mandatory field)
+last_modified
+
+## mime-type of document, string (mandatory field)
+content_type
+
+## content of title tag, text (mandatory field)
+title
+
+## id of the host, a 6-byte hash that is part of the document id (mandatory field)
+host_id_s
+
+## the md5 of the raw source (mandatory field)
+md5_s
+
+## the size of the raw source (mandatory field)
+size_i
+
+## index creation comment (mandatory field)
+process_s
+
+## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
+failreason_t
+
+## HTTP status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
+httpstatus_i
+
+
+### optional but highly recommended values, part of the index distribution process
+
+## time when resource was loaded
+load_date_dt
+
+## date until resource shall be considered as fresh
+fresh_date_dt
+
+## ids of referrer to this document
+referrer_id_txt
+
+## the name of the publisher of the document
+publisher_t
+
+## the language used in the document; starts with primary language
+language_txt
+
+## number of links to audio resources
+audiolinkscount_i
+
+## number of links to video resources
+videolinkscount_i
+
+## number of links to application resources
+applinkscount_i
+
+
+### optional but highly recommended values, not part of the index distribution process
+
## longitude of location as declared in WSG84, tdouble
lon_coordinate
## longitude of location as declared in WSG84, tdouble
lat_coordinate
-## last-modified from http header, date
-last_modified
-
## ip of host of url (after DNS lookup), string
ip_s
-## mime-type of document, string
-content_type
-
-## content of title tag, text
-title
-
## content of author-tag, texgen
author
@@ -45,20 +95,47 @@ keywords
## character encoding, string
charset_s
+## number of words in visible area, int
+wordcount_i
+
+## total number of inbound links, int
+inboundlinkscount_i
+
+## number of inbound links with nofollow tag, int
+inboundlinksnofollowcount_i
+
+## total number of outbound (external) links, int
+outboundlinkscount_i
+
+## number of external links with nofollow tag, int
+outboundlinksnofollowcount_i
+
+## number of images, int
+imagescount_i
+
+## response time of target server in milliseconds, int
+responsetime_i
+
+## all visible text, text
+text_t
+
+
+### optional values, not part of standard YaCy handling (but useful for external applications)
+
## tags of css entries, normalized with absolute URL, textgen
-css_tag_txt
+#css_tag_txt
## urls of css entries, normalized with absolute URL, textgen
-css_url_txt
+#css_url_txt
## number of css entries, int
-csscount_i
+#csscount_i
## urls of script entries, normalized with absolute URL, textgen
-scripts_txt
+#scripts_txt
## number of script entries, int
-scriptscount_i
+#scriptscount_i
## encoded as binary value into an integer:
## bit 0: "all" contained in html header meta
@@ -71,22 +148,13 @@ scriptscount_i
## bit 11: "nofollow" contained in http header properties
## bit 12: "unavailable_after" contained in http header properties
## content of tag and the "X-Robots-Tag" HTTP property
-robots_i
-
-## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int
-httpstatus_i
+#robots_i
## content of tag, text
-metagenerator_t
-
-## all visible text, text
-text_t
-
-## number of words in visible area, int
-wordcount_i
+#metagenerator_t
## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen
-inboundlinks_tag_txt
+#inboundlinks_tag_txt
## internal links, only the protocol
#inboundlinks_protocol_txt
@@ -106,14 +174,8 @@ inboundlinks_tag_txt
## internal links, the text content of the a-tag
#inboundlinks_text_txt
-## total number of inbound links, int
-inboundlinkscount_i
-
-## number of inbound links with nofollow tag, int
-inboundlinksnofollowcount_i
-
## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen
-outboundlinks_tag_txt
+#outboundlinks_tag_txt
## external links, only the protocol
#outboundlinks_protocol_txt
@@ -133,14 +195,8 @@ outboundlinks_tag_txt
## external links, the text content of the a-tag
#outboundlinks_text_txt
-## external number of inbound links, int
-outboundlinkscount_i
-
-## number of external links with nofollow tag, int
-outboundlinksnofollowcount_i
-
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
-images_tag_txt
+#images_tag_txt
## all image links without the protocol and '://'
#images_urlstub_txt
@@ -151,9 +207,6 @@ images_tag_txt
## all image link alt tag
#images_alt_txt
-## number of images, int
-imagescount_i
-
## h1 header, textgen
h1_txt
@@ -161,37 +214,37 @@ h1_txt
h2_txt
## h3 header, textgen
-h3_txt
+#h3_txt
## h4 header, textgen
-h4_txt
+#h4_txt
## h5 header, textgen
-h5_txt
+#h5_txt
## h6 header, textgen
-h6_txt
+#h6_txt
## binary pattern for the existance of h1..h6 headlines, int
-htags_i
+#htags_i
## all path elements in the url, textgen
-paths_txt
+#paths_txt
## host of the url, string
-host_s
+#host_s
## url inside the canonical link element, string
-canonical_s
+#canonical_s
## link from the url property inside the refresh link element, string
-refresh_s
+#refresh_s
## all texts in tags, textgen
-li_txt
+#li_txt
## number of tags, int
-licount_i
+#licount_i
## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
bold_txt
@@ -200,7 +253,7 @@ bold_txt
#bold_val
## total number of occurrences of or , int
-boldcount_i
+#boldcount_i
## all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
italic_txt
@@ -209,22 +262,22 @@ italic_txt
#italic_val
## total number of occurrences of , int
-italiccount_i
+#italiccount_i
## flag that shows if a swf file is linked, boolean
-flash_b
+#flash_b
## list of all links to frames, textgen
-frames_txt
+#frames_txt
## number of attr_frames, int
-framesscount_i
+#framesscount_i
## list of all links to iframes, textgen
-iframes_txt
+#iframes_txt
## number of attr_iframes, int
-iframesscount_i
+#iframesscount_i
## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen
#ext_cms_txt
@@ -261,48 +314,3 @@ iframesscount_i
## number of matching title expressions, textgen
#ext_title_val
-
-## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
-failreason_t
-
-## response time of target server in milliseconds, int
-responsetime_i
-
-
-### values used additionally by URIMetadataRow, part of the index transfer process
-
-## time when resource was loaded
-load_date_dt
-
-## date until resource shall be considered as fresh
-fresh_date_dt
-
-## id of the host, a 6-byte hash that is part of the document id
-host_id_s
-
-## ids of referrer to this document
-referrer_id_txt
-
-## the md5 of the raw source
-md5_s
-
-## the name of the publisher of the document
-publisher_t
-
-## the language used in the document; starts with primary language
-language_txt
-
-## the size of the raw source
-size_i
-
-## number of links to audio resources
-audiolinkscount_i
-
-## number of links to video resources
-videolinkscount_i
-
-## number of links to application resources
-applinkscount_i
-
-## index creation comment
-process_s
\ No newline at end of file
diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java
index d028541e9..135b1adb9 100644
--- a/source/net/yacy/search/index/YaCySchema.java
+++ b/source/net/yacy/search/index/YaCySchema.java
@@ -29,19 +29,52 @@ import net.yacy.cora.services.federated.solr.SolrType;
public enum YaCySchema implements Schema {
+ // mandatory
id(SolrType.string, true, true, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
- ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
- host_s(SolrType.string, true, true, "host of the url"),
+ last_modified(SolrType.date, true, true, "last-modified from http header"),
+ content_type(SolrType.string, true, true, true, "mime-type of document"),
title(SolrType.text_general, true, true, true, "content of title tag"),
+ host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
+ md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
+ size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
+ process_s(SolrType.string, true, true, "index creation comment"),
+ failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
+ httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
+
+ // optional but recommended, part of index distribution
+ load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
+ fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
+ referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
+ publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
+ language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
+ audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
+ videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
+ applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp();
+
+ // optional but recommended, not part of index distribution
+ lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
+ lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
+ ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, "content of author-tag"),
description(SolrType.text_general, true, true, "content of description-tag"),
- content_type(SolrType.string, true, true, true, "mime-type of document"),
- last_modified(SolrType.date, true, true, "last-modified from http header"),
keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"),
- text_t(SolrType.text_general, true, true, "all visible text"),
+ charset_s(SolrType.string, true, true, "character encoding"),
wordcount_i(SolrType.integer, true, true, "number of words in visible area"),
- paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
+ inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
+ inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
+ outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
+ outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
+ imagescount_i(SolrType.integer, true, true, "number of images"),
+ responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
+ text_t(SolrType.text_general, true, true, "all visible text"),
+
+ // optional values
+ csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
+ css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
+ css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
+ scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
+ scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
@@ -53,8 +86,7 @@ public enum YaCySchema implements Schema {
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.integer, true, true, "content of tag and the \"X-Robots-Tag\" HTTP property"),
- inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
- inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
+ metagenerator_t(SolrType.text_general, true, true, "content of tag"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"),
inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
@@ -62,8 +94,6 @@ public enum YaCySchema implements Schema {
inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
- outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
- outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"),
outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
@@ -71,10 +101,10 @@ public enum YaCySchema implements Schema {
outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
- charset_s(SolrType.string, true, true, "character encoding"),
- lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
- lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
- httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
+ images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
+ images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
+ images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
+ images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
@@ -82,33 +112,23 @@ public enum YaCySchema implements Schema {
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
+ paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
+ host_s(SolrType.string, true, true, "host of the url"),
canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
- metagenerator_t(SolrType.text_general, true, true, "content of tag"),
- boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "),
+ li_txt(SolrType.text_general, true, true, true, "all texts in tags"),
+ licount_i(SolrType.integer, true, true, "number of tags"),
bold_txt(SolrType.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"),
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
- italiccount_i(SolrType.integer, true, true, "total number of occurrences of "),
+ boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "),
italic_txt(SolrType.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
- licount_i(SolrType.integer, true, true, "number of tags"),
- li_txt(SolrType.text_general, true, true, true, "all texts in tags"),
- imagescount_i(SolrType.integer, true, true, "number of images"),
- images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
- images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
- images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
- images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
- csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
- css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
- css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
- scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
- scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
+ italiccount_i(SolrType.integer, true, true, "total number of occurrences of "),
+ flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
framesscount_i(SolrType.integer, true, true, "number of frames_txt"),
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"),
- flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
- responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
@@ -120,22 +140,8 @@ public enum YaCySchema implements Schema {
ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"),
ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
- ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
- failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
-
- // values used additionally by URIMetadataRow
- load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
- fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
- host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
- referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
- md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
- publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
- language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
- size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
- audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
- videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
- applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp();
-
+ ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions");
+
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;
private final boolean indexed, stored;