diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 6f336a8c3..757605930 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -9,30 +9,80 @@ ## - all non-empty lines not beginning with '#' are keyword lines ## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines -##url of document, string -sku +### mandatory values, do not disable them, YaCy won't work without them -## primary key of document, the URL hash, string +## primary key of document, the URL hash, string (mandatory field) id +##url of document, string (mandatory field) +sku + +## last-modified from http header, date (mandatory field) +last_modified + +## mime-type of document, string (mandatory field) +content_type + +## content of title tag, text (mandatory field) +title + +## id of the host, a 6-byte hash that is part of the document id (mandatory field) +host_id_s + +## the md5 of the raw source (mandatory field) +md5_s + +## the size of the raw source (mandatory field) +size_i + +## index creation comment (mandatory field) +process_s + +## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field) +failreason_t + +## HTTP status return code (e.g.
"200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field) +httpstatus_i + + +### optional but highly recommended values, part of the index distribution process + +## time when resource was loaded +load_date_dt + +## date until resource shall be considered as fresh +fresh_date_dt + +## ids of referrer to this document +referrer_id_txt + +## the name of the publisher of the document +publisher_t + +## the language used in the document; starts with primary language +language_txt + +## number of links to audio resources +audiolinkscount_i + +## number of links to video resources +videolinkscount_i + +## number of links to application resources +applinkscount_i + + +### optional but highly recommended values, not part of the index distribution process + ## longitude of location as declared in WSG84, tdouble lon_coordinate ## longitude of location as declared in WSG84, tdouble lat_coordinate -## last-modified from http header, date -last_modified - ## ip of host of url (after DNS lookup), string ip_s -## mime-type of document, string -content_type - -## content of title tag, text -title - ## content of author-tag, texgen author @@ -45,20 +95,47 @@ keywords ## character encoding, string charset_s +## number of words in visible area, int +wordcount_i + +## total number of inbound links, int +inboundlinkscount_i + +## number of inbound links with nofollow tag, int +inboundlinksnofollowcount_i + +## total number of outbound (external) links, int +outboundlinkscount_i + +## number of external links with nofollow tag, int +outboundlinksnofollowcount_i + +## number of images, int +imagescount_i + +## response time of target server in milliseconds, int +responsetime_i + +## all visible text, text +text_t + + +### optional values, not part of standard YaCy handling (but useful for external applications) + ## tags of css entries, normalized with absolute URL, textgen -css_tag_txt +#css_tag_txt ## urls of css entries, normalized with absolute URL, 
textgen -css_url_txt +#css_url_txt ## number of css entries, int -csscount_i +#csscount_i ## urls of script entries, normalized with absolute URL, textgen -scripts_txt +#scripts_txt ## number of script entries, int -scriptscount_i +#scriptscount_i ## encoded as binary value into an integer: ## bit 0: "all" contained in html header meta @@ -71,22 +148,13 @@ scriptscount_i ## bit 11: "nofollow" contained in http header properties ## bit 12: "unavailable_after" contained in http header properties ## content of tag and the "X-Robots-Tag" HTTP property -robots_i - -## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int -httpstatus_i +#robots_i ## content of tag, text -metagenerator_t - -## all visible text, text -text_t - -## number of words in visible area, int -wordcount_i +#metagenerator_t ## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -inboundlinks_tag_txt +#inboundlinks_tag_txt ## internal links, only the protocol #inboundlinks_protocol_txt @@ -106,14 +174,8 @@ inboundlinks_tag_txt ## internal links, the text content of the a-tag #inboundlinks_text_txt -## total number of inbound links, int -inboundlinkscount_i - -## number of inbound links with nofollow tag, int -inboundlinksnofollowcount_i - ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -outboundlinks_tag_txt +#outboundlinks_tag_txt ## external links, only the protocol #outboundlinks_protocol_txt @@ -133,14 +195,8 @@ outboundlinks_tag_txt ## external links, the text content of the a-tag #outboundlinks_text_txt -## external number of inbound links, int -outboundlinkscount_i - -## number of external links with nofollow tag, int -outboundlinksnofollowcount_i - ## all image tags, encoded as tag inclusive alt- and title property, textgen -images_tag_txt +#images_tag_txt ## all image links without the protocol and '://' #images_urlstub_txt @@ -151,9 +207,6 @@ 
images_tag_txt ## all image link alt tag #images_alt_txt -## number of images, int -imagescount_i - ## h1 header, textgen h1_txt @@ -161,37 +214,37 @@ h1_txt h2_txt ## h3 header, textgen -h3_txt +#h3_txt ## h4 header, textgen -h4_txt +#h4_txt ## h5 header, textgen -h5_txt +#h5_txt ## h6 header, textgen -h6_txt +#h6_txt ## binary pattern for the existance of h1..h6 headlines, int -htags_i +#htags_i ## all path elements in the url, textgen -paths_txt +#paths_txt ## host of the url, string -host_s +#host_s ## url inside the canonical link element, string -canonical_s +#canonical_s ## link from the url property inside the refresh link element, string -refresh_s +#refresh_s ## all texts in
  • tags, textgen -li_txt +#li_txt ## number of
  • tags, int -licount_i +#licount_i ## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen bold_txt @@ -200,7 +253,7 @@ bold_txt #bold_val ## total number of occurrences of or , int -boldcount_i +#boldcount_i ## all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen italic_txt @@ -209,22 +262,22 @@ italic_txt #italic_val ## total number of occurrences of , int -italiccount_i +#italiccount_i ## flag that shows if a swf file is linked, boolean -flash_b +#flash_b ## list of all links to frames, textgen -frames_txt +#frames_txt ## number of attr_frames, int -framesscount_i +#framesscount_i ## list of all links to iframes, textgen -iframes_txt +#iframes_txt ## number of attr_iframes, int -iframesscount_i +#iframesscount_i ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen #ext_cms_txt @@ -261,48 +314,3 @@ iframesscount_i ## number of matching title expressions, textgen #ext_title_val - -## fail reason if a page was not loaded. 
if the page was loaded then this field is empty, text -failreason_t - -## response time of target server in milliseconds, int -responsetime_i - - -### values used additionally by URIMetadataRow, part of the index transfer process - -## time when resource was loaded -load_date_dt - -## date until resource shall be considered as fresh -fresh_date_dt - -## id of the host, a 6-byte hash that is part of the document id -host_id_s - -## ids of referrer to this document -referrer_id_txt - -## the md5 of the raw source -md5_s - -## the name of the publisher of the document -publisher_t - -## the language used in the document; starts with primary language -language_txt - -## the size of the raw source -size_i - -## number of links to audio resources -audiolinkscount_i - -## number of links to video resources -videolinkscount_i - -## number of links to application resources -applinkscount_i - -## index creation comment -process_s \ No newline at end of file diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java index d028541e9..135b1adb9 100644 --- a/source/net/yacy/search/index/YaCySchema.java +++ b/source/net/yacy/search/index/YaCySchema.java @@ -29,19 +29,52 @@ import net.yacy.cora.services.federated.solr.SolrType; public enum YaCySchema implements Schema { + // mandatory id(SolrType.string, true, true, "primary key of document, the URL hash **mandatory field**"), sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"), - ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"), - host_s(SolrType.string, true, true, "host of the url"), + last_modified(SolrType.date, true, true, "last-modified from http header"), + content_type(SolrType.string, true, true, true, "mime-type of document"), title(SolrType.text_general, true, true, true, "content of title tag"), + host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String 
hosthash(); + md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5(); + size_i(SolrType.integer, true, true, "the size of the raw source"),// int size(); + process_s(SolrType.string, true, true, "index creation comment"), + failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), + httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), + + // optional but recommended, part of index distribution + load_date_dt(SolrType.date, true, true, "time when resource was loaded"), + fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"), + referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash(); + publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher(); + language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language(); + audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio(); + videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo(); + applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp(); + + // optional but recommended + lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"), + lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"), + ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"), author(SolrType.text_general, true, true, "content of author-tag"), description(SolrType.text_general, true, true, "content of description-tag"), - content_type(SolrType.string, true, true, true, "mime-type of document"), - last_modified(SolrType.date, true, true, "last-modified from 
http header"), keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"), - text_t(SolrType.text_general, true, true, "all visible text"), + charset_s(SolrType.string, true, true, "character encoding"), wordcount_i(SolrType.integer, true, true, "number of words in visible area"), - paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"), + inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"), + inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"), + outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"), + outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"), + imagescount_i(SolrType.integer, true, true, "number of images"), + responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"), + text_t(SolrType.text_general, true, true, "all visible text"), + + // optional values + csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"), + css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"), + css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"), + scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"), + scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"), // encoded as binary value into an integer: // bit 0: "all" contained in html header meta // bit 1: "index" contained in html header meta @@ -53,8 +86,7 @@ public enum YaCySchema implements Schema { // bit 11: "nofollow" contained in http header properties // bit 12: "unavailable_after" contained in http header properties robots_i(SolrType.integer, true, true, "content of tag and the \"X-Robots-Tag\" HTTP property"), - inboundlinkscount_i(SolrType.integer, true, true, "total number 
of inbound links"), - inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"), + metagenerator_t(SolrType.text_general, true, true, "content of tag"), inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"), inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"), @@ -62,8 +94,6 @@ public enum YaCySchema implements Schema { inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"), inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"), inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"), - outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"), - outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"), outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"), outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), @@ -71,10 +101,10 @@ public enum YaCySchema implements Schema { outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"), outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"), outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"), - 
charset_s(SolrType.string, true, true, "character encoding"), - lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"), - lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"), - httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), + images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), + images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), + images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"), + images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"), h1_txt(SolrType.text_general, true, true, true, "h1 header"), h2_txt(SolrType.text_general, true, true, true, "h2 header"), h3_txt(SolrType.text_general, true, true, true, "h3 header"), @@ -82,33 +112,23 @@ public enum YaCySchema implements Schema { h5_txt(SolrType.text_general, true, true, true, "h5 header"), h6_txt(SolrType.text_general, true, true, true, "h6 header"), htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"), + paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"), + host_s(SolrType.string, true, true, "host of the url"), canonical_s(SolrType.string, true, true, "url inside the canonical link element"), refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"), - metagenerator_t(SolrType.text_general, true, true, "content of tag"), - boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "), + li_txt(SolrType.text_general, true, true, true, "all texts in
  • tags"), + licount_i(SolrType.integer, true, true, "number of
  • tags"), bold_txt(SolrType.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"), bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), - italiccount_i(SolrType.integer, true, true, "total number of occurrences of "), + boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "), italic_txt(SolrType.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"), italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"), - licount_i(SolrType.integer, true, true, "number of
  • tags"), - li_txt(SolrType.text_general, true, true, true, "all texts in
  • tags"), - imagescount_i(SolrType.integer, true, true, "number of images"), - images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), - images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"), - images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), - images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"), - csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"), - css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"), - css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"), - scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"), - scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"), + italiccount_i(SolrType.integer, true, true, "total number of occurrences of "), + flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"), frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"), framesscount_i(SolrType.integer, true, true, "number of frames_txt"), iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"), iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"), - flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"), - responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"), ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"), ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"), ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"), 
@@ -120,22 +140,8 @@ public enum YaCySchema implements Schema { ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"), ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), - ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"), - failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), - - // values used additionally by URIMetadataRow - load_date_dt(SolrType.date, true, true, "time when resource was loaded"), - fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"), - host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash(); - referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash(); - md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5(); - publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher(); - language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language(); - size_i(SolrType.integer, true, true, "the size of the raw source"),// int size(); - audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio(); - videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo(); - applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp(); - + ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"); + private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same 
as this.name() ) private final SolrType type; private final boolean indexed, stored;