Use the author field as a Solr-native facet. This makes it necessary to

introduce a copyField that copies the author field into a string field
(author_s). That string field is then used to generate the facets. Without it,
the facet would consist only of the individual words of the author names
(because author is a text_general field), not of the full author string.
pull/1/head
Michael Peter Christen 12 years ago
parent 2a4c064c89
commit 01200f06cc

@ -116,6 +116,7 @@
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<copyField source="author" dest="author_s" maxChars="100" />
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>

@ -117,7 +117,7 @@ public final class search {
final String contentdom = post.get("contentdom", "all");
final String filter = post.get("filter", ".*"); // a filter on the url
String sitehash = post.get("sitehash", ""); if (sitehash.isEmpty()) sitehash = null;
String authorhash = post.get("authorhash", ""); if (authorhash.isEmpty()) authorhash = null;
String author = post.get("author", ""); if (author.isEmpty()) author = null;
String language = post.get("language", "");
if (language == null || language.isEmpty() || !ISO639.exists(language)) {
// take language from the user agent
@ -236,7 +236,7 @@ public final class search {
sitehash,
null,
null,
authorhash,
author,
DigestURI.TLD_any_zone_filter,
client,
false,
@ -298,7 +298,7 @@ public final class search {
sitehash,
null,
null,
authorhash,
author,
DigestURI.TLD_any_zone_filter,
client,
false,

@ -531,11 +531,10 @@ public class yacysearch {
}
final int authori = querystring.indexOf("author:", 0);
String authorhash = null;
String author = null;
if ( authori >= 0 ) {
// check if the author was given with single quotes or without
final boolean quotes = (querystring.charAt(authori + 7) == '(');
String author;
if ( quotes ) {
int ftb = querystring.indexOf(')', authori + 8);
if ( ftb == -1 ) {
@ -553,7 +552,6 @@ public class yacysearch {
querystring = querystring.replace("author:" + author, "");
modifier.append("author:").append(author).append(' ');
}
authorhash = ASCII.String(Word.word2hash(author));
}
final int tld = querystring.indexOf("tld:", 0);
@ -738,7 +736,7 @@ public class yacysearch {
sitehash,
sitehost,
DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")),
authorhash,
author,
DigestURI.TLD_any_zone_filter,
client,
authenticated,

@ -64,6 +64,7 @@ public enum YaCySchema implements Schema {
coordinate_p(SolrType.location, true, true, false, "point in degrees of latitude,longitude as declared in WSG84"),
ip_s(SolrType.string, true, true, false, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, false, "content of author-tag"),
author_s(SolrType.string, true, true, false, "content of author-tag as copy-field from author. This is used for facet generation"),
description(SolrType.text_general, true, true, false, "content of description-tag"),
keywords(SolrType.text_general, true, true, false, "content of keywords tag; words are separated by space"),
charset_s(SolrType.string, true, true, false, "character encoding"),

@ -603,7 +603,7 @@ public final class Protocol
final String modifier,
final String language,
final String sitehash,
final String authorhash,
final String author,
final String contentdom,
final int count,
final long time,
@ -641,7 +641,7 @@ public final class Protocol
modifier,
language,
sitehash,
authorhash,
author,
contentdom,
count,
time,
@ -895,7 +895,7 @@ public final class Protocol
final String modifier,
final String language,
final String sitehash,
final String authorhash,
final String author,
final String contentdom,
final int count,
final long time,
@ -944,7 +944,7 @@ public final class Protocol
parts.put("modifier", UTF8.StringBody(modifier));
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash));
parts.put("authorhash", UTF8.StringBody(authorhash));
parts.put("author", UTF8.StringBody(author));
parts.put("contentdom", UTF8.StringBody(contentdom));
parts.put("ttl", UTF8.StringBody("0"));
parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance)));

@ -43,7 +43,7 @@ public class RemoteSearch extends Thread {
private static final ThreadGroup ysThreadGroup = new ThreadGroup("yacySearchThreadGroup");
final private SearchEvent event;
final private String wordhashes, excludehashes, sitehash, authorhash, contentdom;
final private String wordhashes, excludehashes, sitehash, author, contentdom;
final private int partitions;
final private SecondarySearchSuperviser secondarySearchSuperviser;
final private Blacklist blacklist;
@ -61,7 +61,7 @@ public class RemoteSearch extends Thread {
final QueryParams.Modifier modifier,
final String language,
final String sitehash,
final String authorhash,
final String author,
final String contentdom,
final int count,
final long time,
@ -77,7 +77,7 @@ public class RemoteSearch extends Thread {
this.modifier = modifier;
this.language = language;
this.sitehash = sitehash;
this.authorhash = authorhash;
this.author = author;
this.contentdom = contentdom;
this.partitions = partitions;
this.secondarySearchSuperviser = secondarySearchSuperviser;
@ -100,7 +100,7 @@ public class RemoteSearch extends Thread {
this.modifier.getModifier(),
this.language,
this.sitehash,
this.authorhash,
this.author,
this.contentdom,
this.count,
this.time,
@ -177,7 +177,7 @@ public class RemoteSearch extends Thread {
event.query.modifier,
event.query.targetlang == null ? "" : event.query.targetlang,
event.query.nav_sitehash == null ? "" : event.query.nav_sitehash,
event.query.authorhash == null ? "" : event.query.authorhash,
event.query.author == null ? "" : event.query.author,
event.query.contentdom == null ? "all" : event.query.contentdom.toString(),
count,
time,

@ -87,7 +87,7 @@ public final class QueryParams {
YaCySchema.host_s.getSolrFieldName(),
YaCySchema.url_protocol_s.getSolrFieldName(),
YaCySchema.url_file_ext_s.getSolrFieldName(),
YaCySchema.author.getSolrFieldName()};
YaCySchema.author_s.getSolrFieldName()};
private static final int defaultmaxfacets = 30;
@ -128,7 +128,7 @@ public final class QueryParams {
private final String nav_sitehost; // this is a domain name which is used to navigate to that host
public final String nav_sitehash; // this is a domain hash, 6 bytes long or null
protected final Set<String> siteexcludes; // set of domain hashes that are excluded if not included by sitehash
public final String authorhash;
public final String author;
public final Modifier modifier;
public Seed remotepeer;
public final long starttime; // the time when the query started, how long it should take and the time when the timeout is reached (milliseconds)
@ -181,7 +181,7 @@ public final class QueryParams {
this.nav_sitehash = null;
this.nav_sitehost = null;
this.siteexcludes = null;
this.authorhash = null;
this.author = null;
this.remotepeer = null;
this.starttime = Long.valueOf(System.currentTimeMillis());
this.maxtime = 10000;
@ -219,7 +219,7 @@ public final class QueryParams {
final String nav_sitehash,
final String nav_sitehost,
final Set<String> siteexcludes,
final String authorhash,
final String author,
final int domainzone,
final String host,
final boolean specialRights,
@ -257,7 +257,7 @@ public final class QueryParams {
this.nav_sitehash = nav_sitehash; assert nav_sitehash == null || nav_sitehash.length() == 6;
this.nav_sitehost = nav_sitehost;
this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes;
this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty();
this.author = author; assert author == null || !author.isEmpty();
this.snippetCacheStrategy = snippetCacheStrategy;
this.clienthost = host;
this.remotepeer = null;
@ -406,8 +406,14 @@ public final class QueryParams {
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(this.queryGoal.solrQueryString(this.indexSegment.fulltext().getSolrScheme()).toString());
params.setParam("defType", "edismax");
params.setParam("bq", Boost.RANKING.getBoostQuery()); // a boost query that moves double content to the back
params.setParam("bf", Boost.RANKING.getBoostFunction()); // a boost function extension
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);
// add constraints
// add site facets
final StringBuilder fq = new StringBuilder();
if (this.nav_sitehash == null && this.nav_sitehost == null) {
if (this.siteexcludes != null) {
@ -427,12 +433,10 @@ public final class QueryParams {
fq.append(" AND ").append(YaCySchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(YaCySchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
}
params.setParam("defType", "edismax");
params.setParam("bq", Boost.RANKING.getBoostQuery()); // a boost query that moves double content to the back
params.setParam("bf", Boost.RANKING.getBoostFunction()); // a boost function extension
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);
// add author facets
if (this.author != null && this.author.length() > 0) {
fq.append(" AND ").append(YaCySchema.author_s.getSolrFieldName()).append(":\"").append(this.author).append('\"');
}
if (!this.urlMask_isCatchall) {
String urlMaskPattern = this.urlMask.pattern();
@ -544,7 +548,7 @@ public final class QueryParams {
context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk);
context.append(this.nav_sitehash).append(asterisk);
context.append(this.siteexcludes).append(asterisk);
context.append(this.authorhash).append(asterisk);
context.append(this.author).append(asterisk);
context.append(this.targetlang).append(asterisk);
context.append(this.constraint).append(asterisk);
context.append(this.maxDistance).append(asterisk);

@ -327,42 +327,6 @@ public final class RankingProcess extends Thread {
if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop;
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
//this.hostHashNavigator.inc(hosthash);
//this.hostHashResolver.put(hosthash, iEntry.urlhash());
// check vocabulary constraint
/*
String subject = YaCyMetadata.hashURI(iEntry.urlhash());
Resource resource = JenaTripleStore.getResource(subject);
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
// all metatags must appear in the tags list
for (Tagging.Metatag metatag: this.query.metatags) {
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, metatag.getPredicate());
if (!ni.hasNext()) continue pollloop;
String tags = ni.next().toString();
if (tags.indexOf(metatag.getObject()) < 0) continue pollloop;
}
}
*/
// add navigators using the triplestore
/*
for (Map.Entry<String, String> v: this.taggingPredicates.entrySet()) {
Iterator<RDFNode> ni = JenaTripleStore.getObjects(resource, v.getValue());
while (ni.hasNext()) {
String[] tags = CommonPattern.COMMA.split(ni.next().toString());
for (String tag: tags) {
ScoreMap<String> voc = this.vocabularyNavigator.get(v.getKey());
if (voc == null) {
voc = new ConcurrentScoreMap<String>();
this.vocabularyNavigator.put(v.getKey(), voc);
}
voc.inc(tag);
}
}
}
*/
// finally extend the double-check and insert result to stack
this.urlhashes.putUnique(iEntry.urlhash());
rankingtryloop: while (true) {

@ -484,6 +484,11 @@ public final class SearchEvent {
if (fcts != null) this.filetypeNavigator.inc(fcts);
}
if (this.authorNavigator != null) {
fcts = facets.get(YaCySchema.author_s.getSolrFieldName());
if (fcts != null) this.authorNavigator.inc(fcts);
}
if (this.protocolNavigator != null) {
fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName());
if (fcts != null) this.protocolNavigator.inc(fcts);
@ -839,25 +844,6 @@ public final class SearchEvent {
}
}
// evaluate information of metadata for navigation
// author navigation:
if ( pageauthor != null && pageauthor.length() > 0 ) {
// add author to the author navigator
final String authorhash = ASCII.String(Word.word2hash(pageauthor));
// check if we already are filtering for authors
if ( this.query.authorhash != null && !this.query.authorhash.equals(authorhash) ) {
this.query.misses.add(page.hash());
continue;
}
// add author to the author navigator
if (this.authorNavigator != null) this.authorNavigator.inc(pageauthor);
} else if ( this.query.authorhash != null ) {
this.query.misses.add(page.hash());
continue;
}
// check Scanner
if ( !Scanner.acceptURL(page.url()) ) {
this.query.misses.add(page.hash());

Loading…
Cancel
Save