From ac766327d37c4059dcec8544ce565e76fd3923b4 Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 24 Feb 2017 11:08:18 +0100 Subject: [PATCH] Switched a few more Solr fields from strictly mandatory to optional --- defaults/solr.collection.schema | 94 +++++++++---------- .../yacy/search/schema/CollectionSchema.java | 34 ++++--- 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 78b342daf..14cd15268 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -18,21 +18,9 @@ sku ## last-modified from http header, date (mandatory field) last_modified -## if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances -dates_in_content_dts - -## the number of entries in dates_in_content_sxt -dates_in_content_count_i - ## time when resource was loaded load_date_dt -## content of itemprop attributes with content='startDate' -startDates_dts - -## content of itemprop attributes with content='endDate' -endDates_dts - ## mime-type of document, string (mandatory field) content_type @@ -57,24 +45,6 @@ failtype_s ## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field) httpstatus_i -## number of unique http references, should be equal to references_internal_i + references_external_i -references_i - -## number of unique http references from same host to referenced url -references_internal_i - -## number of unique http references from external hosts -references_external_i - -## number of external hosts which provide http references -references_exthosts_i - -## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth -crawldepth_i - -## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated. -harvestkey_s - ## the file name extension url_file_ext_s @@ -100,7 +70,31 @@ images_urlstub_sxt images_protocol_sxt -### No more mandatory (have been mandatory in some older YaCy versions) +### optional but highly recommended values, part of the index distribution process + +## date until resource shall be considered as fresh +fresh_date_dt + +## id of the referrer to this document, discovered during crawling +referrer_id_s + +## the name of the publisher of the document +publisher_t + +## the language used in the document +language_s + +## number of links to audio resources +audiolinkscount_i + +## number of links to video resources +videolinkscount_i + +## number of links to application resources +applinkscount_i + + +### optional but highly recommended values, not part of the index distribution process ## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b #title_exact_signature_l @@ -120,33 +114,35 @@ images_protocol_sxt ## needed (post-)processing steps on this metadata set #process_sxt +## if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances +dates_in_content_dts +## the number of entries in dates_in_content_sxt +dates_in_content_count_i -### optional but highly recommended values, part of the index distribution process - -## date until resource shall be considered as fresh -fresh_date_dt - -## id of the referrer to this document, discovered during crawling -referrer_id_s +## content of itemprop attributes with content='startDate' +startDates_dts -## the name of the publisher of the document -publisher_t +## content of itemprop attributes with content='endDate' +endDates_dts -## the language used in the document -language_s +## number of unique http references, should be equal to references_internal_i + references_external_i +references_i -## number of links to audio resources -audiolinkscount_i +## number of unique http references from same host to referenced url +references_internal_i -## number of links to video resources -videolinkscount_i +## number of unique http references from external hosts +references_external_i -## number of links to application resources -applinkscount_i +## number of external hosts which provide http references +references_exthosts_i +## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth +crawldepth_i -### optional but highly recommended values, not part of the index distribution process +## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated. +harvestkey_s ## unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false http_unique_b diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index e4cfa9034..fdba2e39e 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -35,11 +35,7 @@ public enum CollectionSchema implements SchemaDeclaration { sku(SolrType.string, true, true, false, true, true, "url of document", true), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. //sku(SolrType.text_en_splitting_tight, true, true, false, true, true, "url of document"), // a 'sku' is a stock-keeping unit, a unique identifier and a default field in unmodified solr. last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header", true), // date document was last modified, needed for media search and /date operator - dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances", true), - dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt", true), load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded", true), - startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'", true), - endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='endDate'", true), content_type(SolrType.string, true, true, true, false, false, "mime-type of document", true), title(SolrType.text_general, true, true, true, false, true, "content of title tag", true), host_id_s(SolrType.string, true, true, false, false, false, "id of the host, a 6-byte hash that is part of the document id", true),// String hosthash(); @@ -48,12 +44,6 @@ public enum CollectionSchema implements SchemaDeclaration { failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty", true), failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'", true), httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded", true), - references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i", true), - references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url", true), - references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts", true), - references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references", true), - crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth", true), - harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.", true), url_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension", true), host_organization_s(SolrType.string, true, true, false, false, true, "either the second level domain or, if a ccSLD is used, the third level domain", true), // needed to search in the url inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "internal links, the url only without the protocol", true), // needed for HostBrowser @@ -63,14 +53,6 @@ public enum CollectionSchema implements SchemaDeclaration { images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'", true), images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols", true), // for correct assembly of image url images_protocol_sxt + images_urlstub_sxt is needed - // no more mandatory (have been mandatory in some older YaCy versions) - title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"), - title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"), - exact_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), - fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, true, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"), - fuzzy_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), - process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"), - // optional but recommended, part of index distribution fresh_date_dt(SolrType.date, true, true, false, false, false, "date until resource shall be considered as fresh"), referrer_id_s(SolrType.string, true, true, false, false, false, "id of the referrer to this document, discovered during crawling"),// byte[] referrerHash(); @@ -81,6 +63,22 @@ public enum CollectionSchema implements SchemaDeclaration { applinkscount_i(SolrType.num_integer, true, true, false, false, false, "number of links to application resources"),// int lapp(); // optional but recommended + title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"), + title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"), + exact_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), + fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, true, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"), + fuzzy_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), + process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"), + dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"), + dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"), + startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"), + endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='endDate'"), + references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"), + references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), + references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), + references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), + crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth"), + harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."), http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"), www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"), exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),