- Added new Solr field url_file_name_tokens_t, which stores the file name
  tokens. This can be used to enhance the ranking.
- Also added a rating_i field as a basis for later usage.
- Enhanced the tokenization process.
pull/1/head
Michael Peter Christen 11 years ago
parent 6efa7532d2
commit 1b61bd40ed

@ -359,6 +359,9 @@ url_protocol_s
## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension ## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension
url_file_name_s url_file_name_s
## tokens generated from url_file_name_s which can be used for better matching and result boosting
url_file_name_tokens_t
## the file name extension ## the file name extension
url_file_ext_s url_file_ext_s
@ -441,7 +444,9 @@ cr_host_chance_d
## normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10 ## normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10
cr_host_norm_i cr_host_norm_i
## custom rating; to be set with external rating information
rating_i
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria ## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt #ext_cms_txt

@ -836,7 +836,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all * resulting words are not ordered by appearance, but all
* @return * @return
*/ */
private static String toTokens(final String s) { public static String toTokens(final String s) {
// remove all non-character & non-number // remove all non-character & non-number
final StringBuilder sb = new StringBuilder(s.length()); final StringBuilder sb = new StringBuilder(s.length());
char c; char c;
@ -854,18 +854,18 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
// split the string into tokens and add all camel-case splitting // split the string into tokens and add all camel-case splitting
final String[] u = CommonPattern.SPACE.split(t); final String[] u = CommonPattern.SPACE.split(t);
final Set<String> token = new LinkedHashSet<String>(); final Set<String> token = new LinkedHashSet<String>();
for (final String r: u) { for (final String r: u) token.add(r);
token.addAll(parseCamelCase(r)); for (final String r: u) token.addAll(parseCamelCase(r));
}
// construct a String again // construct a String again
for (final String v: token) if (v.length() > 1) t += ' ' + v; sb.setLength(0);
return t; for (final String v: token) if (v.length() > 1) sb.append(v).append(' ');
return sb.length() == 0 ? "" : sb.substring(0, sb.length() - 1);
} }
public static enum CharType { low, high, number; } public static enum CharType { low, high, number; }
public static Set<String> parseCamelCase(String s) { private static Set<String> parseCamelCase(String s) {
final Set<String> token = new LinkedHashSet<String>(); final Set<String> token = new LinkedHashSet<String>();
if (s.isEmpty()) return token; if (s.isEmpty()) return token;
int p = 0; int p = 0;

@ -211,10 +211,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// path elements of link // path elements of link
String filename = digestURL.getFileName(); String filename = digestURL.getFileName();
String extension = MultiProtocolURL.getFileExtension(filename); String extension = MultiProtocolURL.getFileExtension(filename);
String filenameStub = filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename;
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length()); if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol()); if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths()); if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename); if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filenameStub);
if (allAttr || contains(CollectionSchema.url_file_name_tokens_t)) add(doc, CollectionSchema.url_file_name_tokens_t, MultiProtocolURL.toTokens(filenameStub));
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension); if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype)); if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype));

@ -159,6 +159,7 @@ public enum CollectionSchema implements SchemaDeclaration {
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"), url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
url_file_name_s(SolrType.string, true, true, false, false, true, "the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"), url_file_name_s(SolrType.string, true, true, false, false, true, "the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"),
url_file_name_tokens_t(SolrType.text_general, true, true, false, false, true, "tokens generated from url_file_name_s which can be used for better matching and result boosting"),
url_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension"), url_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension"),
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name"), url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name"),
url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"), url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"),
@ -199,6 +200,9 @@ public enum CollectionSchema implements SchemaDeclaration {
cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"), cr_host_chance_d(SolrType.num_double, true, true, false, false, false, "the chance to click on this page when randomly clicking on links within on one host"),
cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"), cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "normalization of chance: 0 for lower halve of cr_host_count_i urls, 1 for 1/2 of the remaining and so on. the maximum number is 10"),
// custom rating; values to influence the ranking in combination with boost rules
rating_i(SolrType.num_integer, true, true, false, false, false, "custom rating; to be set with external rating information"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard // special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"), bold_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"), italic_val(SolrType.num_integer, true, true, true, false, false, "number of occurrences of texts in italic_txt"),

Loading…
Cancel
Save