The default schema uses only some of them, and the resulting search index
now has the following properties:
- the webgraph index will have about 40 times as many entries as the
default index
- the complete index size will increase and may be about double the
current size
As testing showed, not much indexing performance is lost. The default
index will be smaller (fields were moved out of it); thus searching
can be faster.
The new index allows some old parts of YaCy to be removed,
i.e. the specialized webgraph data and the noload crawler. The new index
will make it possible to:
- search within link texts of linked but not indexed documents (about 20
times the size of the document index!)
- get a very detailed link graph
- enhance ranking using a complete link graph
To get full access to the new index, the API to Solr now has two
access points: one with the attribute core=collection1 for the default
search index and one with core=webgraph for the new webgraph search index.
This is also available for p2p operation, but client access is not yet
implemented.
... the core can be searched at <a href="/solr/select?core=#[core]#&q=*:*&start=0&rows=3">/solr/select?core=#[core]#&q=*:*&start=0&rows=3</a>
finalMap<MultiProtocolURI,ImageEntry>collectedImages=newHashMap<MultiProtocolURI,ImageEntry>();// this is a set that is collected now and joined later to the imagelinks
finalMap<DigestURI,ImageEntry>collectedImages=newHashMap<DigestURI,ImageEntry>();// this is a set that is collected now and joined later to the imagelinks
((freshdate!=null&&freshdate.before(newDate()))?(" AND "+CollectionSchema.load_date_dt.getSolrFieldName()+":[* TO "+ISO8601Formatter.FORMATTER.format(freshdate)+"]"):"");
((freshdate!=null&&freshdate.before(newDate()))?(" AND "+CollectionSchema.load_date_dt.getSolrFieldName()+":[* TO "+ISO8601Formatter.FORMATTER.format(freshdate)+"]"):"");
((freshdate!=null&&freshdate.before(newDate()))?(" AND "+CollectionSchema.load_date_dt.getSolrFieldName()+":[* TO "+ISO8601Formatter.FORMATTER.format(freshdate)+"]"):"");
// also switch the attribute in all existing documents (there should be exactly one!)
SolrDocumentListdocs=this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName()+":"+checkstring+" AND "+uniquefield.getSolrFieldName()+":true",0,1000);
@ -107,9 +107,13 @@ public enum CollectionSchema implements SchemaDeclaration {
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.num_integer,true,true,false,"content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general,true,true,false,"content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_tag_txt(SolrType.text_general,true,true,true,"internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
inboundlinks_protocol_sxt(SolrType.string,true,true,true,"internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general,true,true,true,"internal links, the url only without the protocol"),
inboundlinks_tag_txt(SolrType.text_general,true,true,true,"internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string,true,true,true,"external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general,true,true,true,"external links, the url only without the protocol"),
outboundlinks_tag_txt(SolrType.text_general,true,true,true,"external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
/*
inboundlinks_name_txt(SolrType.text_general,true,true,true,"internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string,true,true,true,"internal links, the rel property of the a-tag"),
inboundlinks_relflags_val(SolrType.num_integer,true,true,true,"internal links, the rel property of the a-tag, coded binary"),
@ -117,9 +121,6 @@ public enum CollectionSchema implements SchemaDeclaration {
inboundlinks_text_chars_val(SolrType.num_integer,true,true,true,"internal links, the length of the a-tag as number of characters"),
inboundlinks_text_words_val(SolrType.num_integer,true,true,true,"internal links, the length of the a-tag as number of words"),
inboundlinks_alttag_txt(SolrType.text_general,true,true,true,"if the link is an image link, this contains the alt tag if the image is also liked as img link"),
outboundlinks_tag_txt(SolrType.text_general,true,true,true,"external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string,true,true,true,"external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general,true,true,true,"external links, the url only without the protocol"),
outboundlinks_name_txt(SolrType.text_general,true,true,true,"external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string,true,true,true,"external links, the rel property of the a-tag"),
outboundlinks_relflags_val(SolrType.num_integer,true,true,true,"external links, the rel property of the a-tag, coded binary"),
@ -127,6 +128,7 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_text_chars_val(SolrType.num_integer,true,true,true,"external links, the length of the a-tag as number of characters"),
outboundlinks_text_words_val(SolrType.num_integer,true,true,true,"external links, the length of the a-tag as number of words"),
outboundlinks_alttag_txt(SolrType.text_general,true,true,true,"if the link is an image link, this contains the alt tag if the image is also liked as img link"),
*/
images_tag_txt(SolrType.text_general,true,true,true," all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general,true,true,true,"all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general,true,true,true,"all image link protocols"),
id(SolrType.string,true,true,false,"primary key of document, a combination of <source-url-hash><target-url-hash><four-digit-hex-counter> (28 characters)"),
last_modified(SolrType.date,true,true,false,"last-modified from http header"),
load_date_dt(SolrType.date,true,true,false,"time when resource was loaded"),
collection_sxt(SolrType.string,true,true,true,"tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
// source information
source_id_s(SolrType.string,true,true,false,"primary key of document, the URL hash (source)"),
source_url_s(SolrType.string,true,true,false,"the url of the document (source)"),
source_protocol_s(SolrType.string,true,true,false,"the protocol of the url (source)"),
source_urlstub_s(SolrType.string,true,true,false,"the url without the protocol (source)"),
source_file_ext_s(SolrType.string,true,true,false,"the file name extension (source)"),
source_tag_s(SolrType.string,true,true,false,"normalized (absolute URLs), as <a> - tag with anchor text and nofollow (source)"),
source_chars_i(SolrType.num_integer,true,true,false,"number of all characters in the url (source)"),
source_protocol_s(SolrType.string,true,true,false,"the protocol of the url (source)"),
source_path_s(SolrType.string,true,true,true,"path of the url (source)"),
source_path_s(SolrType.string,true,true,false,"path of the url (source)"),
source_path_folders_count_i(SolrType.num_integer,true,true,false,"count of all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string,true,true,true,"all path elements in the url (source)"),
source_parameter_count_i(SolrType.num_integer,true,true,false,"number of key-value pairs in search part of the url (source)"),
@ -47,12 +50,14 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_value_sxt(SolrType.string,true,true,true,"the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer,true,true,false,"depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_host_s(SolrType.string,true,true,false,"host of the url"),
source_host_s(SolrType.string,true,true,false,"host of the url (source)"),
source_host_id_s(SolrType.string,true,true,false,"id of the host (source)"),
source_host_dnc_s(SolrType.string,true,true,false,"the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (source)"),
source_host_organization_s(SolrType.string,true,true,false,"either the second level domain or, if a ccSLD is used, the third level domain"),
source_host_organizationdnc_s(SolrType.string,true,true,false,"the organization and dnc concatenated with '.' (source)"),
source_host_subdomain_s(SolrType.string,true,true,false,"the remaining part of the host without organizationdnc (source)"),
// information in the source about the target
target_linktext_t(SolrType.text_general,true,true,false,"the text content of the a-tag (in source, but pointing to a target)"),
target_linktext_charcount_i(SolrType.num_integer,true,true,false,"the length of the a-tag content text as number of characters (in source, but pointing to a target)"),
target_linktext_wordcount_i(SolrType.num_integer,true,true,false,"the length of the a-tag content text as number of words (in source, but pointing to a target)"),
@ -63,14 +68,15 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_rel_s(SolrType.string,true,true,false,"the rel property of the a-tag (in source, but pointing to a target)"),
target_relflags_i(SolrType.num_integer,true,true,false,"the rel property of the a-tag, coded binary (in source, but pointing to a target)"),
// target information
target_id_s(SolrType.string,true,true,false,"primary key of document, the URL hash (target)"),
target_url_s(SolrType.string,true,true,false,"the url of the document (target)"),
target_protocol_s(SolrType.string,true,true,false,"the protocol of the url (target)"),
target_urlstub_s(SolrType.string,true,true,false,"the url without the protocol (target)"),
target_file_ext_s(SolrType.string,true,true,false,"the file name extension (target)"),
target_tag_s(SolrType.string,true,true,false,"normalized (absolute URLs), as <a> - tag with anchor text and nofollow (target)"),
target_chars_i(SolrType.num_integer,true,true,false,"number of all characters in the url (target)"),
target_protocol_s(SolrType.string,true,true,false,"the protocol of the url (target)"),
target_path_s(SolrType.string,true,true,true,"path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer,true,true,true,"count of all path elements in the url (target)"),
target_path_s(SolrType.string,true,true,false,"path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer,true,true,false,"count of all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string,true,true,true,"all path elements in the url (target)"),
target_parameter_count_i(SolrType.num_integer,true,true,false,"number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string,true,true,true,"the keys from key-value pairs in the search part of the url (target)"),
@ -78,11 +84,14 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_clickdepth_i(SolrType.num_integer,true,true,false,"depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_host_s(SolrType.string,true,true,false,"host of the url (target)"),
target_host_id_s(SolrType.string,true,true,false,"id of the host (target)"),
target_host_dnc_s(SolrType.string,true,true,false,"the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used (target)"),
target_host_organization_s(SolrType.string,true,true,false,"either the second level domain or, if a ccSLD is used, the third level domain (target)"),
target_host_organizationdnc_s(SolrType.string,true,true,false,"the organization and dnc concatenated with '.' (target)"),
target_host_subdomain_s(SolrType.string,true,true,false,"the remaining part of the host without organizationdnc (target)");
target_host_subdomain_s(SolrType.string,true,true,false,"the remaining part of the host without organizationdnc (target)"),
target_inbound_b(SolrType.bool,true,true,false,"flag shows if the target host is equal to the source host");