sorted the solr schema into mandatory and optional fields; reduced

number of used fields to reduce solr index size
pull/1/head
orbiter 13 years ago
parent 9b8c8c0f47
commit 716ea0cfe2

@ -9,30 +9,80 @@
## - all non-empty lines not beginning with '#' are keyword lines
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
##url of document, string
sku
### mandatory values, do not disable them, YaCy won't work without them
## primary key of document, the URL hash, string
## primary key of document, the URL hash, string (mandatory field)
id
##url of document, string (mandatory field)
sku
## last-modified from http header, date (mandatory field)
last_modified
## mime-type of document, string (mandatory field)
content_type
## content of title tag, text (mandatory field)
title
## id of the host, a 6-byte hash that is part of the document id (mandatory field)
host_id_s
## the md5 of the raw source (mandatory field)
md5_s
## the size of the raw source (mandatory field)
size_i
## index creation comment (mandatory field)
process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
## HTTP status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## ids of referrer to this document
referrer_id_txt
## the name of the publisher of the document
publisher_t
## the language used in the document; starts with primary language
language_txt
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
### optional but highly recommended values, not part of the index distribution process
## longitude of location as declared in WGS84, tdouble
lon_coordinate
## latitude of location as declared in WGS84, tdouble
lat_coordinate
## last-modified from http header, date
last_modified
## ip of host of url (after DNS lookup), string
ip_s
## mime-type of document, string
content_type
## content of title tag, text
title
## content of author-tag, textgen
author
@ -45,20 +95,47 @@ keywords
## character encoding, string
charset_s
## number of words in visible area, int
wordcount_i
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with nofollow tag, int
inboundlinksnofollowcount_i
## total number of outbound (external) links, int
outboundlinkscount_i
## number of external links with nofollow tag, int
outboundlinksnofollowcount_i
## number of images, int
imagescount_i
## response time of target server in milliseconds, int
responsetime_i
## all visible text, text
text_t
### optional values, not part of standard YaCy handling (but useful for external applications)
## tags of css entries, normalized with absolute URL, textgen
css_tag_txt
#css_tag_txt
## urls of css entries, normalized with absolute URL, textgen
css_url_txt
#css_url_txt
## number of css entries, int
csscount_i
#csscount_i
## urls of script entries, normalized with absolute URL, textgen
scripts_txt
#scripts_txt
## number of script entries, int
scriptscount_i
#scriptscount_i
## encoded as binary value into an integer:
## bit 0: "all" contained in html header meta
@ -71,22 +148,13 @@ scriptscount_i
## bit 11: "nofollow" contained in http header properties
## bit 12: "unavailable_after" contained in http header properties
## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
robots_i
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int
httpstatus_i
#robots_i
## content of <meta name="generator" content=#content#> tag, text
metagenerator_t
## all visible text, text
text_t
## number of words in visible area, int
wordcount_i
#metagenerator_t
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
inboundlinks_tag_txt
#inboundlinks_tag_txt
## internal links, only the protocol
#inboundlinks_protocol_txt
@ -106,14 +174,8 @@ inboundlinks_tag_txt
## internal links, the text content of the a-tag
#inboundlinks_text_txt
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with nofollow tag, int
inboundlinksnofollowcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
outboundlinks_tag_txt
#outboundlinks_tag_txt
## external links, only the protocol
#outboundlinks_protocol_txt
@ -133,14 +195,8 @@ outboundlinks_tag_txt
## external links, the text content of the a-tag
#outboundlinks_text_txt
## total number of outbound (external) links, int
outboundlinkscount_i
## number of external links with nofollow tag, int
outboundlinksnofollowcount_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
images_tag_txt
#images_tag_txt
## all image links without the protocol and '://'
#images_urlstub_txt
@ -151,9 +207,6 @@ images_tag_txt
## all image link alt tag
#images_alt_txt
## number of images, int
imagescount_i
## h1 header, textgen
h1_txt
@ -161,37 +214,37 @@ h1_txt
h2_txt
## h3 header, textgen
h3_txt
#h3_txt
## h4 header, textgen
h4_txt
#h4_txt
## h5 header, textgen
h5_txt
#h5_txt
## h6 header, textgen
h6_txt
#h6_txt
## binary pattern for the existence of h1..h6 headlines, int
htags_i
#htags_i
## all path elements in the url, textgen
paths_txt
#paths_txt
## host of the url, string
host_s
#host_s
## url inside the canonical link element, string
canonical_s
#canonical_s
## link from the url property inside the refresh link element, string
refresh_s
#refresh_s
## all texts in <li> tags, textgen
li_txt
#li_txt
## number of <li> tags, int
licount_i
#licount_i
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
bold_txt
@ -200,7 +253,7 @@ bold_txt
#bold_val
## total number of occurrences of <b> or <strong>, int
boldcount_i
#boldcount_i
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
italic_txt
@ -209,22 +262,22 @@ italic_txt
#italic_val
## total number of occurrences of <i>, int
italiccount_i
#italiccount_i
## flag that shows if a swf file is linked, boolean
flash_b
#flash_b
## list of all links to frames, textgen
frames_txt
#frames_txt
## number of attr_frames, int
framesscount_i
#framesscount_i
## list of all links to iframes, textgen
iframes_txt
#iframes_txt
## number of attr_iframes, int
iframesscount_i
#iframesscount_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria, textgen
#ext_cms_txt
@ -261,48 +314,3 @@ iframesscount_i
## number of matching title expressions, textgen
#ext_title_val
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t
## response time of target server in milliseconds, int
responsetime_i
### values used additionally by URIMetadataRow, part of the index transfer process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## id of the host, a 6-byte hash that is part of the document id
host_id_s
## ids of referrer to this document
referrer_id_txt
## the md5 of the raw source
md5_s
## the name of the publisher of the document
publisher_t
## the language used in the document; starts with primary language
language_txt
## the size of the raw source
size_i
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
## index creation comment
process_s

@ -29,19 +29,52 @@ import net.yacy.cora.services.federated.solr.SolrType;
public enum YaCySchema implements Schema {
// mandatory
id(SolrType.string, true, true, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
host_s(SolrType.string, true, true, "host of the url"),
last_modified(SolrType.date, true, true, "last-modified from http header"),
content_type(SolrType.string, true, true, true, "mime-type of document"),
title(SolrType.text_general, true, true, true, "content of title tag"),
host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
process_s(SolrType.string, true, true, "index creation comment"),
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp();
// optional but recommended
lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, "content of author-tag"),
description(SolrType.text_general, true, true, "content of description-tag"),
content_type(SolrType.string, true, true, true, "mime-type of document"),
last_modified(SolrType.date, true, true, "last-modified from http header"),
keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"),
text_t(SolrType.text_general, true, true, "all visible text"),
charset_s(SolrType.string, true, true, "character encoding"),
wordcount_i(SolrType.integer, true, true, "number of words in visible area"),
paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
imagescount_i(SolrType.integer, true, true, "number of images"),
responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, "all visible text"),
// optional values
csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
@ -53,8 +86,7 @@ public enum YaCySchema implements Schema {
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.integer, true, true, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
@ -62,8 +94,6 @@ public enum YaCySchema implements Schema {
inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
@ -71,10 +101,10 @@ public enum YaCySchema implements Schema {
outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
charset_s(SolrType.string, true, true, "character encoding"),
lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
@ -82,33 +112,23 @@ public enum YaCySchema implements Schema {
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
host_s(SolrType.string, true, true, "host of the url"),
canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
licount_i(SolrType.integer, true, true, "number of <li> tags"),
bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italiccount_i(SolrType.integer, true, true, "total number of occurrences of <i>"),
boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
licount_i(SolrType.integer, true, true, "number of <li> tags"),
li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
imagescount_i(SolrType.integer, true, true, "number of images"),
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
italiccount_i(SolrType.integer, true, true, "total number of occurrences of <i>"),
flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
framesscount_i(SolrType.integer, true, true, "number of frames_txt"),
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"),
flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
@ -120,21 +140,7 @@ public enum YaCySchema implements Schema {
ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"),
ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
// values used additionally by URIMetadataRow
load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp();
ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions");
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;

Loading…
Cancel
Save