From 2ddc33646ab650e0b631a0b9ad2b5efb8fa5e9eb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 29 Aug 2012 16:11:23 +0200 Subject: [PATCH] added new field for solr: url_paths_sxt url_parameter_i url_parameter_key_sxt url_parameter_value_sxt url_chars_i --- defaults/solr.keys.list | 88 ++++++++------- .../yacy/cora/document/MultiProtocolURI.java | 106 ++++++++++-------- .../yacy/search/index/SolrConfiguration.java | 42 +++++-- source/net/yacy/search/index/YaCySchema.java | 13 ++- 4 files changed, 151 insertions(+), 98 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index c150e29d5..a28f28b65 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -89,7 +89,7 @@ author ## content of description-tag, text description -## content of keywords tag; words are separated by space, textgen +## content of keywords tag; words are separated by space keywords ## character encoding, string @@ -119,37 +119,37 @@ responsetime_i ## all visible text, text text_t -## h1 header, textgen +## h1 header h1_txt -## h2 header, textgen +## h2 header h2_txt -## h3 header, textgen +## h3 header h3_txt -## h4 header, textgen +## h4 header h4_txt -## h5 header, textgen +## h5 header h5_txt -## h6 header, textgen +## h6 header h6_txt ### optional values, not part of standard YaCy handling (but useful for external applications) -## tags of css entries, normalized with absolute URL, textgen +## tags of css entries, normalized with absolute URL #css_tag_txt -## urls of css entries, normalized with absolute URL, textgen +## urls of css entries, normalized with absolute URL #css_url_txt ## number of css entries, int #csscount_i -## urls of script entries, normalized with absolute URL, textgen +## urls of script entries, normalized with absolute URL #scripts_txt ## number of script entries, int @@ -171,7 +171,7 @@ h6_txt ## content of tag, text #metagenerator_t -## internal links, normalized (absolute URLs), as - tag with anchor text and 
nofollow, textgen +## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow #inboundlinks_tag_txt ## internal links, only the protocol @@ -192,7 +192,7 @@ h6_txt ## internal links, the text content of the a-tag #inboundlinks_text_txt -## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen +## external links, normalized (absolute URLs), as - tag with anchor text and nofollow #outboundlinks_tag_txt ## external links, only the protocol @@ -213,7 +213,7 @@ h6_txt ## external links, the text content of the a-tag #outboundlinks_text_txt -## all image tags, encoded as tag inclusive alt- and title property, textgen +## all image tags, encoded as tag inclusive alt- and title property #images_tag_txt ## all image links without the protocol and '://' @@ -228,34 +228,31 @@ h6_txt ## binary pattern for the existance of h1..h6 headlines, int #htags_i -## all path elements in the url, textgen -#paths_txt - ## url inside the canonical link element, string #canonical_t ## link from the url property inside the refresh link element, string #refresh_s -## all texts in
<li> tags, textgen +## all texts in
<li> tags #li_txt ## number of
  • tags, int #licount_i -## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen +## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order bold_txt -## number of occurrences of texts in bold_txt, textgen +## number of occurrences of texts in bold_txt #bold_val ## total number of occurrences of or , int #boldcount_i -## all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen +## all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order italic_txt -## number of occurrences of texts in italic_txt, textgen +## number of occurrences of texts in italic_txt #italic_val ## total number of occurrences of , int @@ -264,24 +261,39 @@ italic_txt ## flag that shows if a swf file is linked, boolean #flash_b -## list of all links to frames, textgen +## list of all links to frames #frames_txt ## number of attr_frames, int #framesscount_i -## list of all links to iframes, textgen +## list of all links to iframes #iframes_txt ## number of attr_iframes, int #iframesscount_i +## the protocol of the url +#url_protocol_s + +## all path elements in the url +#url_paths_sxt + +## number of key-value pairs in search part of the url +#url_parameter_i + +## the keys from key-value pairs in the search part of the url +#url_parameter_key_sxt + +## the values from key-value pairs in the search part of the url +#url_parameter_value_sxt + +## number of all characters in the url == length of sku field +#url_chars_i + ## host of the url, string #host_s -## the protocol of the url -#host_protocol_s - ## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used. 
#host_dnc_s @@ -294,38 +306,38 @@ italic_txt ## the remaining part of the host without organizationdnc #host_subdomain_s -## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen +## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias #ext_cms_txt -##number of attributes that count for a specific cms in attr_cms, textgen +##number of attributes that count for a specific cms in attr_cms #ext_cms_val -## names of ad-servers/ad-services, textgen +## names of ad-servers/ad-services #ext_ads_txt -## number of attributes counts in attr_ads, textgen +## number of attributes counts in attr_ads #ext_ads_val -## names of recognized community functions, textgen +## names of recognized community functions #ext_community_txt -## number of attribute counts in attr_community, textgen +## number of attribute counts in attr_community #ext_community_val -## names of map services, textgen +## names of map services #ext_maps_txt -## number of attribute counts in attr_maps, textgen +## number of attribute counts in attr_maps #ext_maps_val -## names of tracker server, textgen +## names of tracker server #ext_tracker_txt -## number of attribute counts in attr_tracker, textgen +## number of attribute counts in attr_tracker #ext_tracker_val -## names matching title expressions, textgen +## names matching title expressions #ext_title_txt -## number of matching title expressions, textgen +## number of matching title expressions #ext_title_val diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 32a0c8538..a03577e96 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -88,7 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable= 0) throw new MalformedURLException("invalid '&' in 
host"); this.path = resolveBackpath(this.path); identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 445 : -1))))); - identRef(); - identQuest(); + identAnchor(); + identSearchpart(); escape(); } else { // this is not a http or ftp url @@ -202,8 +202,8 @@ public class MultiProtocolURI implements Serializable, Comparable 0) ? 1 : 0); } - private void escapeRef() { - this.ref = escape(this.ref).toString(); + private void escapeAnchor() { + this.anchor = escape(this.anchor).toString(); } - private void escapeQuest() { - final String[] questp = patternAmp.split(this.quest, -1); - final StringBuilder qtmp = new StringBuilder(this.quest.length() + 10); + private void escapeSearchpart() { + final String[] questp = patternAmp.split(this.searchpart, -1); + final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); for (final String element : questp) { if (element.indexOf('=') != -1) { qtmp.append('&'); @@ -433,7 +433,7 @@ public class MultiProtocolURI implements Serializable, Comparable 0) ? 1 : 0); + this.searchpart = qtmp.substring((qtmp.length() > 0) ? 
1 : 0); } private final static String[] hex = { @@ -610,24 +610,24 @@ public class MultiProtocolURI implements Serializable, Comparable map = new LinkedHashMap(); + for (String part: parts) { + int p = part.indexOf('='); + if (p > 0) map.put(part.substring(0, p), part.substring(p + 1)); else map.put(part, ""); + } + return map; } @Override @@ -926,7 +938,7 @@ public class MultiProtocolURI implements Serializable, Comparable 0); + return (this.searchpart != null) && (this.searchpart.length() > 0); } public final boolean isCGI() { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index f95558d2b..34187db76 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -195,12 +195,22 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); add(doc, YaCySchema.id, ASCII.String(md.hash())); - add(doc, YaCySchema.sku, digestURI.toNormalform(true, false)); + String us = digestURI.toNormalform(true, false); + add(doc, YaCySchema.sku, us); if (allAttr || contains(YaCySchema.ip_s)) { final InetAddress address = digestURI.getInetAddress(); if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress()); } - if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol()); + if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol()); + Map searchpart = digestURI.getSearchpartMap(); + if (searchpart == null) { + if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0); + } else { + if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size()); + if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, 
YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()])); + if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()])); + } + if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length()); String host = null; if ((host = digestURI.getHost()) != null) { String dnc = Domains.getDNC(host); @@ -234,9 +244,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // path elements of link final String path = digestURI.getPath(); - if (path != null && (allAttr || contains(YaCySchema.paths_txt))) { + if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) { final String[] paths = path.split("/"); - if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths); + if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths); } if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage()); @@ -291,13 +301,23 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); boolean allAttr = this.isEmpty(); add(doc, YaCySchema.id, id); - add(doc, YaCySchema.sku, digestURI.toNormalform(true, false)); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + String us = digestURI.toNormalform(true, false); + add(doc, YaCySchema.sku, us); if (allAttr || contains(YaCySchema.ip_s)) { - final InetAddress address = digestURI.getInetAddress(); - if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress()); + final InetAddress address = digestURI.getInetAddress(); + if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress()); + } + if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, 
digestURI.getProtocol()); + Map searchpart = digestURI.getSearchpartMap(); + if (searchpart == null) { + if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0); + } else { + if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size()); + if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()])); + if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()])); } - if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol()); + if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length()); String host = null; if ((host = digestURI.getHost()) != null) { String dnc = Domains.getDNC(host); @@ -326,9 +346,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // path elements of link final String path = digestURI.getPath(); - if (path != null && (allAttr || contains(YaCySchema.paths_txt))) { + if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) { final String[] paths = path.split("/"); - if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths); + if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths); } // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme @@ -751,7 +771,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final String path = digestURI.getPath(); if (path != null) { final String[] paths = path.split("/"); - if (paths.length > 0) add(solrdoc, YaCySchema.paths_txt, paths); + if (paths.length > 0) add(solrdoc, YaCySchema.url_paths_sxt, paths); } add(solrdoc, YaCySchema.failreason_t, failReason); add(solrdoc, YaCySchema.httpstatus_i, httpstatus); 
diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java index a18413c63..0ca35f370 100644 --- a/source/net/yacy/search/index/YaCySchema.java +++ b/source/net/yacy/search/index/YaCySchema.java @@ -117,7 +117,6 @@ public enum YaCySchema implements Schema { images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"), images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"), htags_i(SolrType.integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"), - paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"), canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"), refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"), li_txt(SolrType.text_general, true, true, true, "all texts in
  • tags"), @@ -132,13 +131,23 @@ public enum YaCySchema implements Schema { iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"), iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"), + url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"), + url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"), + url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"), + url_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url"), + url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"), + host_s(SolrType.string, true, true, false, "host of the url"), - host_protocol_s(SolrType.string, true, true, false, "the protocol of the url"), + url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"), host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."), host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"), host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"), host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"), + //title_count_i(SolrType.integer, true, true, false, ""), + //title_chars_i(SolrType.integer, true, true, false, ""), + //title_words_i(SolrType.integer, true, true, false, ""), + // special values; can only be used if '_val' type is defined in schema file; this is not standard bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),