This uses an enhanced version of the Nutch/Solr TextProfileSignature.
As a result, a signature of the document is written to the Solr search
index. Additionally, each time a signature is written, it is
checked whether the signature already exists in the index. If the signature
does not exist, the document is marked as unique. The unique attribute
can now be used to sort document lists and bring duplicates to the end
of a result list.
To enable this, a large portion of the search API to Solr had to be
changed. This mainly affected the caching of 'exists' searches, which
speeds up the check for existing signatures so that it can be done
without actually performing a Solr query.
Because this is the first time a long number is used as a value in the Solr
store, the value naming in the YaCySchema also had to be adapted and
normalized. As a result, many files had to be changed.
@ -36,11 +36,16 @@ public enum YaCySchema implements Schema {
title(SolrType.text_general,true,true,true,"content of title tag"),
host_id_s(SolrType.string,true,true,false,"id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string,true,true,false,"the md5 of the raw source"),// String md5();
size_i(SolrType.integer,true,true,false,"the size of the raw source"),// int size();
exact_signature_l(SolrType.num_long,true,true,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
exact_signature_unique_b(SolrType.bool,true,true,false,"flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"),
fuzzy_signature_l(SolrType.num_long,true,true,false,"64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"),
fuzzy_signature_text_t(SolrType.text_general,true,true,false,"intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool,true,true,false,"flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
size_i(SolrType.num_integer,true,true,false,"the size of the raw source"),// int size();
@ -77,11 +82,11 @@ public enum YaCySchema implements Schema {
// optional values, not part of standard YaCy handling (but useful for external applications)
collection_sxt(SolrType.string,true,true,true,"tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
csscount_i(SolrType.integer,true,true,false,"number of entries in css_tag_txt and css_url_txt"),
csscount_i(SolrType.num_integer,true,true,false,"number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general,true,true,true,"full css tag with normalized url"),
css_url_txt(SolrType.text_general,true,true,true,"normalized urls within a css tag"),
scripts_txt(SolrType.text_general,true,true,true,"normalized urls within a scripts tag"),
scriptscount_i(SolrType.integer,true,true,false,"number of entries in scripts_txt"),
scriptscount_i(SolrType.num_integer,true,true,false,"number of entries in scripts_txt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
@ -92,57 +97,57 @@ public enum YaCySchema implements Schema {
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
robots_i(SolrType.integer,true,true,false,"content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
robots_i(SolrType.num_integer,true,true,false,"content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general,true,true,false,"content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_tag_txt(SolrType.text_general,true,true,true,"internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
inboundlinks_protocol_sxt(SolrType.string,true,true,true,"internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general,true,true,true,"internal links, the url only without the protocol"),
inboundlinks_name_txt(SolrType.text_general,true,true,true,"internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string,true,true,true,"internal links, the rel property of the a-tag"),
inboundlinks_relflags_val(SolrType.integer,true,true,true,"internal links, the rel property of the a-tag, coded binary"),
inboundlinks_relflags_val(SolrType.num_integer,true,true,true,"internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general,true,true,true,"internal links, the text content of the a-tag"),
inboundlinks_text_chars_val(SolrType.integer,true,true,true,"internal links, the length of the a-tag as number of characters"),
inboundlinks_text_words_val(SolrType.integer,true,true,true,"internal links, the length of the a-tag as number of words"),
inboundlinks_text_chars_val(SolrType.num_integer,true,true,true,"internal links, the length of the a-tag as number of characters"),
inboundlinks_text_words_val(SolrType.num_integer,true,true,true,"internal links, the length of the a-tag as number of words"),
inboundlinks_alttag_txt(SolrType.text_general,true,true,true,"if the link is an image link, this contains the alt tag if the image is also liked as img link"),
outboundlinks_tag_txt(SolrType.text_general,true,true,true,"external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string,true,true,true,"external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general,true,true,true,"external links, the url only without the protocol"),
outboundlinks_name_txt(SolrType.text_general,true,true,true,"external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string,true,true,true,"external links, the rel property of the a-tag"),
outboundlinks_relflags_val(SolrType.integer,true,true,true,"external links, the rel property of the a-tag, coded binary"),
outboundlinks_relflags_val(SolrType.num_integer,true,true,true,"external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general,true,true,true,"external links, the text content of the a-tag"),
outboundlinks_text_chars_val(SolrType.integer,true,true,true,"external links, the length of the a-tag as number of characters"),
outboundlinks_text_words_val(SolrType.integer,true,true,true,"external links, the length of the a-tag as number of words"),
outboundlinks_text_chars_val(SolrType.num_integer,true,true,true,"external links, the length of the a-tag as number of characters"),
outboundlinks_text_words_val(SolrType.num_integer,true,true,true,"external links, the length of the a-tag as number of words"),
outboundlinks_alttag_txt(SolrType.text_general,true,true,true,"if the link is an image link, this contains the alt tag if the image is also liked as img link"),
images_tag_txt(SolrType.text_general,true,true,true," all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general,true,true,true,"all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general,true,true,true,"all image link protocols"),
images_alt_txt(SolrType.text_general,true,true,true,"all image link alt tag"),
images_withalt_i(SolrType.integer,true,true,false,"number of image links with alt tag"),
htags_i(SolrType.integer,true,true,false,"binary pattern for the existance of h1..h6 headlines"),
images_withalt_i(SolrType.num_integer,true,true,false,"number of image links with alt tag"),
htags_i(SolrType.num_integer,true,true,false,"binary pattern for the existance of h1..h6 headlines"),
canonical_t(SolrType.text_general,true,true,false,"url inside the canonical link element"),
refresh_s(SolrType.string,true,true,false,"link from the url property inside the refresh link element"),
li_txt(SolrType.text_general,true,true,true,"all texts in <li> tags"),
licount_i(SolrType.integer,true,true,false,"number of <li> tags"),
licount_i(SolrType.num_integer,true,true,false,"number of <li> tags"),
bold_txt(SolrType.text_general,true,true,true,"all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.integer,true,true,false,"total number of occurrences of <b> or <strong>"),
boldcount_i(SolrType.num_integer,true,true,false,"total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general,true,true,true,"all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italiccount_i(SolrType.integer,true,true,false,"total number of occurrences of <i>"),
italiccount_i(SolrType.num_integer,true,true,false,"total number of occurrences of <i>"),
underline_txt(SolrType.text_general,true,true,true,"all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
underlinecount_i(SolrType.integer,true,true,false,"total number of occurrences of <u>"),
underlinecount_i(SolrType.num_integer,true,true,false,"total number of occurrences of <u>"),
flash_b(SolrType.bool,true,true,false,"flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general,true,true,true,"list of all links to frames"),
framesscount_i(SolrType.integer,true,true,false,"number of frames_txt"),
framesscount_i(SolrType.num_integer,true,true,false,"number of frames_txt"),
iframes_txt(SolrType.text_general,true,true,true,"list of all links to iframes"),
iframesscount_i(SolrType.integer,true,true,false,"number of iframes_txt"),
iframesscount_i(SolrType.num_integer,true,true,false,"number of iframes_txt"),
url_protocol_s(SolrType.string,true,true,false,"the protocol of the url"),
url_paths_sxt(SolrType.string,true,true,true,"all path elements in the url"),
url_file_ext_s(SolrType.string,true,true,false,"the file name extension"),
url_parameter_i(SolrType.integer,true,true,false,"number of key-value pairs in search part of the url"),
url_parameter_i(SolrType.num_integer,true,true,false,"number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string,true,true,true,"the keys from key-value pairs in the search part of the url"),
url_parameter_value_sxt(SolrType.string,true,true,true,"the values from key-value pairs in the search part of the url"),
url_chars_i(SolrType.integer,true,true,false,"number of all characters in the url == length of sku field"),
url_chars_i(SolrType.num_integer,true,true,false,"number of all characters in the url == length of sku field"),
host_s(SolrType.string,true,true,false,"host of the url"),
host_dnc_s(SolrType.string,true,true,false,"the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
@ -150,43 +155,43 @@ public enum YaCySchema implements Schema {
host_organizationdnc_s(SolrType.string,true,true,false,"the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string,true,true,false,"the remaining part of the host without organizationdnc"),
title_count_i(SolrType.integer,true,true,false,"number of titles (counting the 'title' field) in the document"),
title_chars_val(SolrType.integer,true,true,true,"number of characters for each title"),
title_words_val(SolrType.integer,true,true,true,"number of words in each title"),
title_count_i(SolrType.num_integer,true,true,false,"number of titles (counting the 'title' field) in the document"),
title_chars_val(SolrType.num_integer,true,true,true,"number of characters for each title"),
title_words_val(SolrType.num_integer,true,true,true,"number of words in each title"),
description_count_i(SolrType.integer,true,true,false,"number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
description_chars_val(SolrType.integer,true,true,true,"number of characters for each description"),
description_words_val(SolrType.integer,true,true,true,"number of words in each description"),
description_count_i(SolrType.num_integer,true,true,false,"number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
description_chars_val(SolrType.num_integer,true,true,true,"number of characters for each description"),
description_words_val(SolrType.num_integer,true,true,true,"number of words in each description"),
h1_i(SolrType.integer,true,true,false,"number of h1 header lines"),
h2_i(SolrType.integer,true,true,false,"number of h2 header lines"),
h3_i(SolrType.integer,true,true,false,"number of h3 header lines"),
h4_i(SolrType.integer,true,true,false,"number of h4 header lines"),
h5_i(SolrType.integer,true,true,false,"number of h5 header lines"),
h6_i(SolrType.integer,true,true,false,"number of h6 header lines"),
h1_i(SolrType.num_integer,true,true,false,"number of h1 header lines"),
h2_i(SolrType.num_integer,true,true,false,"number of h2 header lines"),
h3_i(SolrType.num_integer,true,true,false,"number of h3 header lines"),
h4_i(SolrType.num_integer,true,true,false,"number of h4 header lines"),
h5_i(SolrType.num_integer,true,true,false,"number of h5 header lines"),
h6_i(SolrType.num_integer,true,true,false,"number of h6 header lines"),
schema_org_breadcrumb_i(SolrType.integer,true,true,false,"number of itemprop=\"breadcrumb\" appearances in div tags"),
schema_org_breadcrumb_i(SolrType.num_integer,true,true,false,"number of itemprop=\"breadcrumb\" appearances in div tags"),
opengraph_title_t(SolrType.text_general,true,true,false,"Open Graph Metadata from og:title metadata field, see http://ogp.me/ns#"),
opengraph_type_s(SolrType.text_general,true,true,false,"Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"),
opengraph_url_s(SolrType.text_general,true,true,false,"Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"),
opengraph_image_s(SolrType.text_general,true,true,false,"Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer,true,true,true,"number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer,true,true,true,"number of occurrences of texts in italic_txt"),
underline_val(SolrType.integer,true,true,true,"number of occurrences of texts in underline_txt"),
bold_val(SolrType.num_integer,true,true,true,"number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer,true,true,true,"number of occurrences of texts in italic_txt"),
underline_val(SolrType.num_integer,true,true,true,"number of occurrences of texts in underline_txt"),
ext_cms_txt(SolrType.text_general,true,true,true,"names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(SolrType.integer,true,true,true,"number of attributes that count for a specific cms in ext_cms_txt"),
ext_cms_val(SolrType.num_integer,true,true,true,"number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general,true,true,true,"names of ad-servers/ad-services"),
ext_ads_val(SolrType.integer,true,true,true,"number of attributes counts in ext_ads_txt"),
ext_ads_val(SolrType.num_integer,true,true,true,"number of attributes counts in ext_ads_txt"),
ext_community_txt(SolrType.text_general,true,true,true,"names of recognized community functions"),
ext_community_val(SolrType.integer,true,true,true,"number of attribute counts in attr_community"),
ext_community_val(SolrType.num_integer,true,true,true,"number of attribute counts in attr_community"),
ext_maps_txt(SolrType.text_general,true,true,true,"names of map services"),
ext_maps_val(SolrType.integer,true,true,true,"number of attribute counts in ext_maps_txt"),
ext_maps_val(SolrType.num_integer,true,true,true,"number of attribute counts in ext_maps_txt"),
ext_tracker_txt(SolrType.text_general,true,true,true,"names of tracker server"),
ext_tracker_val(SolrType.integer,true,true,true,"number of attribute counts in ext_tracker_txt"),
ext_tracker_val(SolrType.num_integer,true,true,true,"number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general,true,true,true,"names matching title expressions"),
ext_title_val(SolrType.integer,true,true,true,"number of matching title expressions");
ext_title_val(SolrType.num_integer,true,true,true,"number of matching title expressions");
privateStringsolrFieldName=null;// solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
privatefinalSolrTypetype;
@ -269,16 +274,19 @@ public enum YaCySchema implements Schema {