added new fields for solr:

url_paths_sxt
url_parameter_i
url_parameter_key_sxt
url_parameter_value_sxt
url_chars_i
pull/1/head
Michael Peter Christen 13 years ago
parent 75d5e3475d
commit 2ddc33646a

@ -89,7 +89,7 @@ author
## content of description-tag, text
description
## content of keywords tag; words are separated by space, textgen
## content of keywords tag; words are separated by space
keywords
## character encoding, string
@ -119,37 +119,37 @@ responsetime_i
## all visible text, text
text_t
## h1 header, textgen
## h1 header
h1_txt
## h2 header, textgen
## h2 header
h2_txt
## h3 header, textgen
## h3 header
h3_txt
## h4 header, textgen
## h4 header
h4_txt
## h5 header, textgen
## h5 header
h5_txt
## h6 header, textgen
## h6 header
h6_txt
### optional values, not part of standard YaCy handling (but useful for external applications)
## tags of css entries, normalized with absolute URL, textgen
## tags of css entries, normalized with absolute URL
#css_tag_txt
## urls of css entries, normalized with absolute URL, textgen
## urls of css entries, normalized with absolute URL
#css_url_txt
## number of css entries, int
#csscount_i
## urls of script entries, normalized with absolute URL, textgen
## urls of script entries, normalized with absolute URL
#scripts_txt
## number of script entries, int
@ -171,7 +171,7 @@ h6_txt
## content of <meta name="generator" content=#content#> tag, text
#metagenerator_t
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
#inboundlinks_tag_txt
## internal links, only the protocol
@ -192,7 +192,7 @@ h6_txt
## internal links, the text content of the a-tag
#inboundlinks_text_txt
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow
#outboundlinks_tag_txt
## external links, only the protocol
@ -213,7 +213,7 @@ h6_txt
## external links, the text content of the a-tag
#outboundlinks_text_txt
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
## all image tags, encoded as <img> tag inclusive alt- and title property
#images_tag_txt
## all image links without the protocol and '://'
@ -228,34 +228,31 @@ h6_txt
## binary pattern for the existence of h1..h6 headlines, int
#htags_i
## all path elements in the url, textgen
#paths_txt
## url inside the canonical link element, string
#canonical_t
## link from the url property inside the refresh link element, string
#refresh_s
## all texts in <li> tags, textgen
## all texts in <li> tags
#li_txt
## number of <li> tags, int
#licount_i
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
bold_txt
## number of occurrences of texts in bold_txt, textgen
## number of occurrences of texts in bold_txt
#bold_val
## total number of occurrences of <b> or <strong>, int
#boldcount_i
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen
## all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order
italic_txt
## number of occurrences of texts in italic_txt, textgen
## number of occurrences of texts in italic_txt
#italic_val
## total number of occurrences of <i>, int
@ -264,24 +261,39 @@ italic_txt
## flag that shows if a swf file is linked, boolean
#flash_b
## list of all links to frames, textgen
## list of all links to frames
#frames_txt
## number of attr_frames, int
#framesscount_i
## list of all links to iframes, textgen
## list of all links to iframes
#iframes_txt
## number of attr_iframes, int
#iframesscount_i
## the protocol of the url
#url_protocol_s
## all path elements in the url
#url_paths_sxt
## number of key-value pairs in search part of the url
#url_parameter_i
## the keys from key-value pairs in the search part of the url
#url_parameter_key_sxt
## the values from key-value pairs in the search part of the url
#url_parameter_value_sxt
## number of all characters in the url == length of sku field
#url_chars_i
## host of the url, string
#host_s
## the protocol of the url
#host_protocol_s
## the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used.
#host_dnc_s
@ -294,38 +306,38 @@ italic_txt
## the remaining part of the host without organizationdnc
#host_subdomain_s
## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt
##number of attributes that count for a specific cms in attr_cms, textgen
## number of attributes that count for a specific cms in attr_cms
#ext_cms_val
## names of ad-servers/ad-services, textgen
## names of ad-servers/ad-services
#ext_ads_txt
## number of attributes counts in attr_ads, textgen
## number of attributes counts in attr_ads
#ext_ads_val
## names of recognized community functions, textgen
## names of recognized community functions
#ext_community_txt
## number of attribute counts in attr_community, textgen
## number of attribute counts in attr_community
#ext_community_val
## names of map services, textgen
## names of map services
#ext_maps_txt
## number of attribute counts in attr_maps, textgen
## number of attribute counts in attr_maps
#ext_maps_val
## names of tracker server, textgen
## names of tracker server
#ext_tracker_txt
## number of attribute counts in attr_tracker, textgen
## number of attribute counts in attr_tracker
#ext_tracker_val
## names matching title expressions, textgen
## names matching title expressions
#ext_title_txt
## number of matching title expressions, textgen
## number of matching title expressions
#ext_title_val

@ -88,7 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// class variables
protected final String protocol, userInfo;
protected String host, path, quest, ref;
protected String host, path, searchpart, anchor;
protected int port;
protected InetAddress hostAddress;
protected ContentDomain contentDomain;
@ -102,8 +102,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.hostAddress = null;
this.userInfo = null;
this.path = null;
this.quest = null;
this.ref = null;
this.searchpart = null;
this.anchor = null;
this.contentDomain = null;
this.port = -1;
}
@ -118,8 +118,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.hostAddress = null;
this.userInfo = url.userInfo;
this.path = url.path;
this.quest = url.quest;
this.ref = url.ref;
this.searchpart = url.searchpart;
this.anchor = url.anchor;
this.contentDomain = null;
this.port = url.port;
}
@ -187,8 +187,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
if (this.host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host");
this.path = resolveBackpath(this.path);
identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 445 : -1)))));
identRef();
identQuest();
identAnchor();
identSearchpart();
escape();
} else {
// this is not a http or ftp url
@ -202,8 +202,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.host = url.substring(q + 1);
this.path = null;
this.port = -1;
this.quest = null;
this.ref = null;
this.searchpart = null;
this.anchor = null;
} else if (this.protocol.equals("file")) {
// parse file url
final String h = url.substring(p + 1);
@ -229,8 +229,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
this.userInfo = null;
this.port = -1;
this.quest = null;
this.ref = null;
this.searchpart = null;
this.anchor = null;
} else {
throw new MalformedURLException("unknown protocol: " + url);
}
@ -352,12 +352,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
}
}
this.quest = baseURL.quest;
this.ref = baseURL.ref;
this.searchpart = baseURL.searchpart;
this.anchor = baseURL.anchor;
this.path = resolveBackpath(this.path);
identRef();
identQuest();
identAnchor();
identSearchpart();
escape();
}
@ -368,11 +368,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.host = host;
this.port = port;
this.path = path;
this.quest = null;
this.searchpart = null;
this.userInfo = null;
this.ref = null;
identRef();
identQuest();
this.anchor = null;
identAnchor();
identSearchpart();
escape();
}
@ -401,8 +401,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
*/
private void escape() {
if (this.path != null && this.path.indexOf('%') == -1) escapePath();
if (this.quest != null && this.quest.indexOf('%') == -1) escapeQuest();
if (this.ref != null && this.ref.indexOf('%') == -1) escapeRef();
if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
if (this.anchor != null && this.anchor.indexOf('%') == -1) escapeAnchor();
}
private void escapePath() {
@ -415,13 +415,13 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
}
private void escapeRef() {
this.ref = escape(this.ref).toString();
private void escapeAnchor() {
this.anchor = escape(this.anchor).toString();
}
private void escapeQuest() {
final String[] questp = patternAmp.split(this.quest, -1);
final StringBuilder qtmp = new StringBuilder(this.quest.length() + 10);
private void escapeSearchpart() {
final String[] questp = patternAmp.split(this.searchpart, -1);
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final String element : questp) {
if (element.indexOf('=') != -1) {
qtmp.append('&');
@ -433,7 +433,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
qtmp.append(escape(element));
}
}
this.quest = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
this.searchpart = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
}
private final static String[] hex = {
@ -610,24 +610,24 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
}
private void identRef() {
private void identAnchor() {
// identify anchor in file
final int r = this.path.indexOf('#');
if (r < 0) {
this.ref = null;
this.anchor = null;
} else {
this.ref = this.path.substring(r + 1);
this.anchor = this.path.substring(r + 1);
this.path = this.path.substring(0, r);
}
}
private void identQuest() {
private void identSearchpart() {
// identify searchpart in file
final int r = this.path.indexOf('?');
if (r < 0) {
this.quest = null;
this.searchpart = null;
} else {
this.quest = this.path.substring(r + 1);
this.searchpart = this.path.substring(r + 1);
this.path = this.path.substring(0, r);
}
}
@ -640,25 +640,25 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// this is the path plus searchpart plus anchor
// if there is no searchpart and no anchor the result is identical to getPath
// this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile()
if (this.quest == null) {
if (excludeReference || this.ref == null) return this.path;
if (this.searchpart == null) {
if (excludeReference || this.anchor == null) return this.path;
final StringBuilder sb = new StringBuilder(120);
sb.append(this.path);
sb.append('#');
sb.append(this.ref);
sb.append(this.anchor);
return sb.toString();
}
String q = this.quest;
String q = this.searchpart;
if (removeSessionID) {
for (final String sid: sessionIDnames.keySet()) {
if (q.toLowerCase().startsWith(sid.toLowerCase() + "=")) {
final int p = q.indexOf('&');
if (p < 0) {
if (excludeReference || this.ref == null) return this.path;
if (excludeReference || this.anchor == null) return this.path;
final StringBuilder sb = new StringBuilder(120);
sb.append(this.path);
sb.append('#');
sb.append(this.ref);
sb.append(this.anchor);
return sb.toString();
}
q = q.substring(p + 1);
@ -678,9 +678,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
sb.append(this.path);
sb.append('?');
sb.append(q);
if (excludeReference || this.ref == null) return sb.toString();
if (excludeReference || this.anchor == null) return sb.toString();
sb.append('#');
sb.append(this.ref);
sb.append(this.anchor);
return sb.toString();
}
@ -758,11 +758,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
public String getRef() {
return this.ref;
return this.anchor;
}
public void removeRef() {
this.ref = null;
this.anchor = null;
}
/**
@ -773,8 +773,20 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return this.userInfo;
}
public String getQuery() {
return this.quest;
public String getSearchpart() {
return this.searchpart;
}
/**
 * Splits the search part (the text after '?') of this URL into key/value pairs.
 *
 * The search part is read but never modified: the HTML-escaped separator
 * "&amp;" is normalized to "&" on a local copy only, so calling this getter
 * does not change the state that equals(), isPOST() or the normal form rely on.
 *
 * Segments are separated by '&'. A segment with '=' at an index greater than 0
 * is stored as key (text before the first '=') and value (text after it);
 * any other non-empty segment is stored as a key with an empty value.
 * Empty segments (empty search part, "&&", leading '&') are skipped so that
 * they are not counted as parameters. If a key occurs more than once, the
 * last value wins while the position of the first occurrence is kept.
 *
 * @return a map of the parameters in order of first appearance; an empty map
 *         if the search part contains no parameters; null if this URL has no
 *         search part at all
 */
public Map<String, String> getSearchpartMap() {
    if (this.searchpart == null) return null;
    // work on a local copy; a getter must not rewrite the url's search part
    final String normalized = this.searchpart.replace("&amp;", "&");
    final Map<String, String> map = new LinkedHashMap<String, String>();
    for (final String part : normalized.split("&")) {
        // "".split("&") yields one empty element; "a&&b" yields an empty middle element
        if (part.isEmpty()) continue;
        final int p = part.indexOf('=');
        if (p > 0) {
            map.put(part.substring(0, p), part.substring(p + 1));
        } else {
            map.put(part, "");
        }
    }
    return map;
}
@Override
@ -926,7 +938,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
((this.host == null && other.host == null) || (this.host != null && other.host != null && this.host.equals(other.host))) &&
((this.userInfo == null && other.userInfo == null) || (this.userInfo != null && other.userInfo != null && this.userInfo.equals(other.userInfo))) &&
((this.path == null && other.path == null) || (this.path != null && other.path != null && this.path.equals(other.path))) &&
((this.quest == null && other.quest == null) || (this.quest != null && other.quest != null && this.quest.equals(other.quest))) &&
((this.searchpart == null && other.searchpart == null) || (this.searchpart != null && other.searchpart != null && this.searchpart.equals(other.searchpart))) &&
this.port == other.port;
}
@ -936,7 +948,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
public boolean isPOST() {
return (this.quest != null) && (this.quest.length() > 0);
return (this.searchpart != null) && (this.searchpart.length() > 0);
}
public final boolean isCGI() {

@ -195,12 +195,22 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, "");
add(doc, YaCySchema.id, ASCII.String(md.hash()));
add(doc, YaCySchema.sku, digestURI.toNormalform(true, false));
String us = digestURI.toNormalform(true, false);
add(doc, YaCySchema.sku, us);
if (allAttr || contains(YaCySchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
}
if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
Map<String, String> searchpart = digestURI.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
} else {
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length());
String host = null;
if ((host = digestURI.getHost()) != null) {
String dnc = Domains.getDNC(host);
@ -234,9 +244,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(YaCySchema.paths_txt))) {
if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths);
if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
}
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage());
@ -291,13 +301,23 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
boolean allAttr = this.isEmpty();
add(doc, YaCySchema.id, id);
add(doc, YaCySchema.sku, digestURI.toNormalform(true, false));
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
String us = digestURI.toNormalform(true, false);
add(doc, YaCySchema.sku, us);
if (allAttr || contains(YaCySchema.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
}
if (allAttr || contains(YaCySchema.host_protocol_s)) add(doc, YaCySchema.host_protocol_s, digestURI.getProtocol());
if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
Map<String, String> searchpart = digestURI.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
} else {
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length());
String host = null;
if ((host = digestURI.getHost()) != null) {
String dnc = Domains.getDNC(host);
@ -326,9 +346,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(YaCySchema.paths_txt))) {
if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
if (paths.length > 0) add(doc, YaCySchema.paths_txt, paths);
if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
@ -751,7 +771,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final String path = digestURI.getPath();
if (path != null) {
final String[] paths = path.split("/");
if (paths.length > 0) add(solrdoc, YaCySchema.paths_txt, paths);
if (paths.length > 0) add(solrdoc, YaCySchema.url_paths_sxt, paths);
}
add(solrdoc, YaCySchema.failreason_t, failReason);
add(solrdoc, YaCySchema.httpstatus_i, httpstatus);

@ -117,7 +117,6 @@ public enum YaCySchema implements Schema {
images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
htags_i(SolrType.integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"),
paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"),
refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
@ -132,13 +131,23 @@ public enum YaCySchema implements Schema {
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"),
url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"),
url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"),
url_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url"),
url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"),
host_s(SolrType.string, true, true, false, "host of the url"),
host_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
//title_count_i(SolrType.integer, true, true, false, ""),
//title_chars_i(SolrType.integer, true, true, false, ""),
//title_words_i(SolrType.integer, true, true, false, ""),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),

Loading…
Cancel
Save