diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index c150e29d5..a28f28b65 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -89,7 +89,7 @@ author
## content of description-tag, text
description
-## content of keywords tag; words are separated by space, textgen
+## content of keywords tag; words are separated by space
keywords
## character encoding, string
@@ -119,37 +119,37 @@ responsetime_i
## all visible text, text
text_t
-## h1 header, textgen
+## h1 header
h1_txt
-## h2 header, textgen
+## h2 header
h2_txt
-## h3 header, textgen
+## h3 header
h3_txt
-## h4 header, textgen
+## h4 header
h4_txt
-## h5 header, textgen
+## h5 header
h5_txt
-## h6 header, textgen
+## h6 header
h6_txt
### optional values, not part of standard YaCy handling (but useful for external applications)
-## tags of css entries, normalized with absolute URL, textgen
+## tags of css entries, normalized with absolute URL
#css_tag_txt
-## urls of css entries, normalized with absolute URL, textgen
+## urls of css entries, normalized with absolute URL
#css_url_txt
## number of css entries, int
#csscount_i
-## urls of script entries, normalized with absolute URL, textgen
+## urls of script entries, normalized with absolute URL
#scripts_txt
## number of script entries, int
@@ -171,7 +171,7 @@ h6_txt
## content of <meta name="generator"> tag, text
#metagenerator_t
-## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
+## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
#inboundlinks_tag_txt
## internal links, only the protocol
@@ -192,7 +192,7 @@ h6_txt
## internal links, the text content of the a-tag
#inboundlinks_text_txt
-## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
+## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
#outboundlinks_tag_txt
## external links, only the protocol
@@ -213,7 +213,7 @@ h6_txt
## external links, the text content of the a-tag
#outboundlinks_text_txt
-## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
+## all image tags, encoded as <img> tag inclusive alt- and title property
#images_tag_txt
## all image links without the protocol and '://'
@@ -228,34 +228,31 @@ h6_txt
## binary pattern for the existance of h1..h6 headlines, int
#htags_i
-## all path elements in the url, textgen
-#paths_txt
-
## url inside the canonical link element, string
#canonical_t
## link from the url property inside the refresh link element, string
#refresh_s
-## all texts in <li> tags, textgen
+## all texts in <li> tags
#li_txt
## number of <li> tags, int
#licount_i
-## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
+## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
bold_txt
-## number of occurrences of texts in bold_txt, textgen
+## number of occurrences of texts in bold_txt
#bold_val
## total number of occurrences of <b> or <strong>, int
#boldcount_i
-## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
+## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order
italic_txt
-## number of occurrences of texts in italic_txt, textgen
+## number of occurrences of texts in italic_txt
#italic_val
## total number of occurrences of <i>, int
@@ -264,24 +261,39 @@ italic_txt
## flag that shows if a swf file is linked, boolean
#flash_b
-## list of all links to frames, textgen
+## list of all links to frames
#frames_txt
## number of attr_frames, int
#framesscount_i
-## list of all links to iframes, textgen
+## list of all links to iframes
#iframes_txt
## number of attr_iframes, int
#iframesscount_i
+## the protocol of the url
+#url_protocol_s
+
+## all path elements in the url
+#url_paths_sxt
+
+## number of key-value pairs in search part of the url
+#url_parameter_i
+
+## the keys from key-value pairs in the search part of the url
+#url_parameter_key_sxt
+
+## the values from key-value pairs in the search part of the url
+#url_parameter_value_sxt
+
+## number of all characters in the url == length of sku field
+#url_chars_i
+
## host of the url, string
#host_s
-## the protocol of the url
-#host_protocol_s
-
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used.
#host_dnc_s
@@ -294,38 +306,38 @@ italic_txt
## the remaining part of the host without organizationdnc
#host_subdomain_s
-## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen
+## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt
-##number of attributes that count for a specific cms in attr_cms, textgen
+##number of attributes that count for a specific cms in attr_cms
#ext_cms_val
-## names of ad-servers/ad-services, textgen
+## names of ad-servers/ad-services
#ext_ads_txt
-## number of attributes counts in attr_ads, textgen
+## number of attributes counts in attr_ads
#ext_ads_val
-## names of recognized community functions, textgen
+## names of recognized community functions
#ext_community_txt
-## number of attribute counts in attr_community, textgen
+## number of attribute counts in attr_community
#ext_community_val
-## names of map services, textgen
+## names of map services
#ext_maps_txt
-## number of attribute counts in attr_maps, textgen
+## number of attribute counts in attr_maps
#ext_maps_val
-## names of tracker server, textgen
+## names of tracker server
#ext_tracker_txt
-## number of attribute counts in attr_tracker, textgen
+## number of attribute counts in attr_tracker
#ext_tracker_val
-## names matching title expressions, textgen
+## names matching title expressions
#ext_title_txt
-## number of matching title expressions, textgen
+## number of matching title expressions
#ext_title_val
diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java
index 32a0c8538..a03577e96 100644
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@@ -88,7 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable= 0) throw new MalformedURLException("invalid '&' in host");
this.path = resolveBackpath(this.path);
identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 445 : -1)))));
- identRef();
- identQuest();
+ identAnchor();
+ identSearchpart();
escape();
} else {
// this is not a http or ftp url
@@ -202,8 +202,8 @@ public class MultiProtocolURI implements Serializable, Comparable 0) ? 1 : 0);
}
- private void escapeRef() {
- this.ref = escape(this.ref).toString();
+ private void escapeAnchor() {
+ this.anchor = escape(this.anchor).toString();
}
- private void escapeQuest() {
- final String[] questp = patternAmp.split(this.quest, -1);
- final StringBuilder qtmp = new StringBuilder(this.quest.length() + 10);
+ private void escapeSearchpart() {
+ final String[] questp = patternAmp.split(this.searchpart, -1);
+ final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final String element : questp) {
if (element.indexOf('=') != -1) {
qtmp.append('&');
@@ -433,7 +433,7 @@ public class MultiProtocolURI implements Serializable, Comparable 0) ? 1 : 0);
+ this.searchpart = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
}
private final static String[] hex = {
@@ -610,24 +610,24 @@ public class MultiProtocolURI implements Serializable, Comparable getSearchpartMap() {
+ if (this.searchpart == null) return null;
+ this.searchpart = this.searchpart.replaceAll("&amp;", "&");
+ String[] parts = this.searchpart.split("&");
+ Map<String, String> map = new LinkedHashMap<String, String>();
+ for (String part: parts) {
+ int p = part.indexOf('=');
+ if (p > 0) map.put(part.substring(0, p), part.substring(p + 1)); else map.put(part, "");
+ }
+ return map;
}
@Override
@@ -926,7 +938,7 @@ public class MultiProtocolURI implements Serializable, Comparable 0);
+ return (this.searchpart != null) && (this.searchpart.length() > 0);
}
public final boolean isCGI() {
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index f95558d2b..34187db76 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -195,12 +195,22 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, "");
add(doc, YaCySchema.id, ASCII.String(md.hash()));
- add(doc, YaCySchema.sku, digestURI.toNormalform(true, false));
+ String us = digestURI.toNormalform(true, false);
+ add(doc, YaCySchema.sku, us);
if (allAttr || contains(YaCySchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
}
- if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
+ if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
+ Map<String, String> searchpart = digestURI.getSearchpartMap();
+ if (searchpart == null) {
+ if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
+ } else {
+ if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
+ if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
+ if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
+ }
+ if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length());
String host = null;
if ((host = digestURI.getHost()) != null) {
String dnc = Domains.getDNC(host);
@@ -234,9 +244,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// path elements of link
final String path = digestURI.getPath();
- if (path != null && (allAttr || contains(YaCySchema.paths_txt))) {
+ if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
- if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths);
+ if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
}
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage());
@@ -291,13 +301,23 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
boolean allAttr = this.isEmpty();
add(doc, YaCySchema.id, id);
- add(doc, YaCySchema.sku, digestURI.toNormalform(true, false));
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
+ String us = digestURI.toNormalform(true, false);
+ add(doc, YaCySchema.sku, us);
if (allAttr || contains(YaCySchema.ip_s)) {
- final InetAddress address = digestURI.getInetAddress();
- if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
+ final InetAddress address = digestURI.getInetAddress();
+ if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
+ }
+ if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
+ Map<String, String> searchpart = digestURI.getSearchpartMap();
+ if (searchpart == null) {
+ if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
+ } else {
+ if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
+ if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
+ if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
- if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
+ if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length());
String host = null;
if ((host = digestURI.getHost()) != null) {
String dnc = Domains.getDNC(host);
@@ -326,9 +346,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// path elements of link
final String path = digestURI.getPath();
- if (path != null && (allAttr || contains(YaCySchema.paths_txt))) {
+ if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
- if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths);
+ if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
@@ -751,7 +771,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final String path = digestURI.getPath();
if (path != null) {
final String[] paths = path.split("/");
- if (paths.length > 0) add(solrdoc, YaCySchema.paths_txt, paths);
+ if (paths.length > 0) add(solrdoc, YaCySchema.url_paths_sxt, paths);
}
add(solrdoc, YaCySchema.failreason_t, failReason);
add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java
index a18413c63..0ca35f370 100644
--- a/source/net/yacy/search/index/YaCySchema.java
+++ b/source/net/yacy/search/index/YaCySchema.java
@@ -117,7 +117,6 @@ public enum YaCySchema implements Schema {
images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
htags_i(SolrType.integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"),
- paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"),
refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, "all texts in tags"),
@@ -132,13 +131,23 @@ public enum YaCySchema implements Schema {
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"),
+ url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"),
+ url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"),
+ url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"),
+ url_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url"),
+ url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"),
+
host_s(SolrType.string, true, true, false, "host of the url"),
- host_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
+ url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
+ //title_count_i(SolrType.integer, true, true, false, ""),
+ //title_chars_i(SolrType.integer, true, true, false, ""),
+ //title_words_i(SolrType.integer, true, true, false, ""),
+
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),