webgraph index which is temporarily filled with the crawl profile key.
This is used to select a set of documents for post-processing as soon as
a crawl is finished. Now the postprocessing for a specific crawl is
started when that specific crawl is finished and not at the end of all
post-processing steps.
## needed (post-)processing steps on this metadata set
process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
harvestkey_s
### optional but highly recommended values, part of the index distribution process
## needed (post-)processing steps on this metadata set, used e.g. for clickdepth-computation.
#process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
@ -59,6 +59,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_exthosts_i(SolrType.num_integer,true,true,false,false,false,"number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer,true,true,false,false,false,"depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string,true,true,true,false,false,"needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string,true,true,false,false,false,"key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// optional but recommended, part of index distribution
load_date_dt(SolrType.date,true,true,false,false,false,"time when resource was loaded"),
@ -231,6 +232,23 @@ public enum CollectionSchema implements SchemaDeclaration {
@ -36,6 +36,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
load_date_dt(SolrType.date,true,true,false,false,false,"time when resource was loaded"),
collection_sxt(SolrType.string,true,true,true,false,false,"tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string,true,true,true,false,false,"needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
harvestkey_s(SolrType.string,true,true,false,false,false,"key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
source_id_s(SolrType.string,true,true,false,false,false,"primary key of document, the URL hash (source)"),
@ -114,6 +115,23 @@ public enum WebgraphSchema implements SchemaDeclaration {