a document. This is an upper bound for the clickdepth_i value, which may
be smaller if the crawler did not take the shortest path to
the document.
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler took to get to this document; if the crawl was started at a root document, then this is an upper bound for clickdepth_i
crawldepth_i
## needed (post-)processing steps on this metadata set
@ -377,10 +377,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}else{
clickdepth=999;
}
processTypes.add(ProcessType.CLICKDEPTH);// postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
if(document.getDepth()<2)clickdepth=Math.min(clickdepth,document.getDepth());// thats not true if the start url was not a root URL. We need a test for that.
if(clickdepth>2)processTypes.add(ProcessType.CLICKDEPTH);// postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc,clickdepth);// no lazy value checking to get a '0' into the index
@ -58,6 +58,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_external_i(SolrType.num_integer,true,true,false,false,false,"number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer,true,true,false,false,false,"number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer,true,true,false,false,false,"depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer,true,true,false,false,false,"crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
process_sxt(SolrType.string,true,true,true,false,false,"needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string,true,true,false,false,false,"key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),