- added faceted drill-down for host and geolocation to solr queries

- added a new geolocation field to the index schema; the old values are
migrated if possible
pull/1/head
Michael Peter Christen 13 years ago
parent f00168ecc5
commit e8acd542b5

@ -80,6 +80,9 @@ lon_coordinate
## longitude of location as declared in WGS84, tdouble ## longitude of location as declared in WGS84, tdouble
lat_coordinate lat_coordinate
## point in degrees of latitude,longitude as declared in WGS84, location
coordinate_p
## ip of host of url (after DNS lookup), string ## ip of host of url (after DNS lookup), string
ip_s ip_s

@ -29,6 +29,7 @@ public enum SolrType {
string, string,
text_general, text_general,
text_en_splitting_tight, text_en_splitting_tight,
location,
date, date,
integer("int"), integer("int"),
bool("boolean"), bool("boolean"),

@ -92,4 +92,8 @@ public class GeoLocation extends IntegerGeoPoint implements Comparable<GeoLocati
return o1.compareTo(o2); return o1.compareTo(o2);
} }
/**
 * Converts a value given in degrees into whole kilometers.
 * The input is multiplied by 111.32 and the fractional part of the
 * product is discarded by the cast to int (truncation toward zero).
 * NOTE(review): 111.32 is presumably the approximate length in km of one
 * degree of arc on the earth's surface - confirm.
 * @param degree the value in degrees
 * @return the product degree * 111.32, truncated to an int
 */
public static int degreeToKm(double degree) {
    final double kilometers = degree * 111.32d;
    return (int) kilometers;
}
} }

@ -56,7 +56,7 @@ public class URIMetadataNode implements URIMetadata {
private DigestURI url; private DigestURI url;
Bitfield flags; Bitfield flags;
private final int imagec, audioc, videoc, appc; private final int imagec, audioc, videoc, appc;
private final double lon, lat; private double lat, lon;
private long ranking; // during generation of a search result this value is set private long ranking; // during generation of a search result this value is set
private final SolrDocument doc; private final SolrDocument doc;
private final String snippet; private final String snippet;
@ -84,6 +84,14 @@ public class URIMetadataNode implements URIMetadata {
this.appc = getInt(YaCySchema.videolinkscount_i); this.appc = getInt(YaCySchema.videolinkscount_i);
this.lon = getDouble(YaCySchema.lon_coordinate); this.lon = getDouble(YaCySchema.lon_coordinate);
this.lat = getDouble(YaCySchema.lat_coordinate); this.lat = getDouble(YaCySchema.lat_coordinate);
String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name());
if (latlon != null) {
int p = latlon.indexOf(',');
if (p > 0) {
this.lat = Double.parseDouble(latlon.substring(0, p));
this.lon = Double.parseDouble(latlon.substring(p + 1));
}
}
this.flags = new Bitfield(); this.flags = new Bitfield();
if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);

@ -1019,18 +1019,21 @@ public final class Protocol
public static int solrQuery( public static int solrQuery(
final SearchEvent event, final SearchEvent event,
final HandleSet wordhashes,
final int offset, final int offset,
final int count, final int count,
final long time, final long time,
final Seed target, final Seed target,
final Blacklist blacklist) { final Blacklist blacklist) {
final HandleSet wordhashes = event.getQuery().query_include_hashes;
if (event.getQuery().queryString == null || event.getQuery().queryString.length() == 0) { if (event.getQuery().queryString == null || event.getQuery().queryString.length() == 0) {
return -1; // we cannot query solr only with word hashes, there is no clear text string return -1; // we cannot query solr only with word hashes, there is no clear text string
} }
event.rankingProcess.addExpectedRemoteReferences(count); event.rankingProcess.addExpectedRemoteReferences(count);
SolrDocumentList docList = null; SolrDocumentList docList = null;
final String solrQuerystring = "{!lucene q.op=AND df=text_t}" + event.getQuery().solrQueryString(false); final String solrQuerystring = event.getQuery().solrQueryString(false);
Log.logInfo("Protocol", "SOLR QUERY: " + solrQuerystring);
boolean localsearch = target == null || target.equals(event.peers.mySeed()); boolean localsearch = target == null || target.equals(event.peers.mySeed());
if (localsearch) { if (localsearch) {
// search the local index // search the local index

@ -169,7 +169,7 @@ public class RemoteSearch extends Thread {
for (Seed s: targetPeers) omit.add(s); for (Seed s: targetPeers) omit.add(s);
Seed[] nodes = PeerSelection.selectNodeSearchTargets(event.peers, 20, omit); Seed[] nodes = PeerSelection.selectNodeSearchTargets(event.peers, 20, omit);
for (Seed s: nodes) { for (Seed s: nodes) {
solrRemoteSearch(event, count, event.getQuery().query_include_hashes, time, s, blacklist); solrRemoteSearch(event, count, time, s, blacklist);
} }
// start search to YaCy peers // start search to YaCy peers
@ -258,11 +258,11 @@ public class RemoteSearch extends Thread {
public static Thread solrRemoteSearch( public static Thread solrRemoteSearch(
final SearchEvent event, final SearchEvent event,
final int count, final int count,
final HandleSet wordhashes,
final long time, final long time,
final Seed targetPeer, final Seed targetPeer,
final Blacklist blacklist) { final Blacklist blacklist) {
// check own peer status // check own peer status
if (event.peers.mySeed() == null || event.peers.mySeed().getPublicAddress() == null) { return null; } if (event.peers.mySeed() == null || event.peers.mySeed().getPublicAddress() == null) { return null; }
@ -275,7 +275,6 @@ public class RemoteSearch extends Thread {
try { try {
int urls = Protocol.solrQuery( int urls = Protocol.solrQuery(
event, event,
wordhashes,
0, 0,
count, count,
time, time,

@ -156,12 +156,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
Date x = (Date) doc.getFieldValue(key.name()); Date x = (Date) doc.getFieldValue(key.name());
return (x == null) ? new Date(0) : x; return (x == null) ? new Date(0) : x;
} }
public Date getDate(SolrDocument doc, final YaCySchema key) { public Date getDate(SolrDocument doc, final YaCySchema key) {
Date x = (Date) doc.getFieldValue(key.name()); Date x = (Date) doc.getFieldValue(key.name());
return (x == null) ? new Date(0) : x; return (x == null) ? new Date(0) : x;
} }
/** /**
* save configuration to file and update enum SolrFields * save configuration to file and update enum SolrFields
* @throws IOException * @throws IOException
@ -186,8 +186,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
public SolrInputDocument metadata2solr(final URIMetadata md) { public SolrInputDocument metadata2solr(final URIMetadata md) {
if (md instanceof URIMetadataNode) { if (md instanceof URIMetadataNode) {
return ClientUtils.toSolrInputDocument(((URIMetadataNode) md).getDocument()); return ClientUtils.toSolrInputDocument(((URIMetadataNode) md).getDocument());
} }
final SolrInputDocument doc = new SolrInputDocument(); final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(md.url()); final DigestURI digestURI = new DigestURI(md.url());
boolean allAttr = this.isEmpty(); boolean allAttr = this.isEmpty();
@ -206,7 +206,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype())); if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, md.moddate()); if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, md.moddate());
if (allAttr || contains(YaCySchema.wordcount_i)) add(doc, YaCySchema.wordcount_i, md.wordCount()); if (allAttr || contains(YaCySchema.wordcount_i)) add(doc, YaCySchema.wordcount_i, md.wordCount());
String keywords = md.dc_subject(); String keywords = md.dc_subject();
Bitfield flags = md.flags(); Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) { if (flags.get(Condenser.flag_cat_indexof)) {
@ -214,7 +214,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof"; if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
} }
} }
if (allAttr || contains(YaCySchema.keywords)) { if (allAttr || contains(YaCySchema.keywords)) {
add(doc, YaCySchema.keywords, keywords); add(doc, YaCySchema.keywords, keywords);
} }
@ -233,7 +233,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// coordinates // coordinates
if (md.lat() != 0.0f && md.lon() != 0.0f) { if (md.lat() != 0.0f && md.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, md.lat()); if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, md.lat());
if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, md.lon()); if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, md.lon());
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
} }
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200); if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
@ -261,10 +262,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
accText(sb, keywords); accText(sb, keywords);
add(doc, YaCySchema.text_t, sb.toString()); add(doc, YaCySchema.text_t, sb.toString());
} }
return doc; return doc;
} }
private static void accText(final StringBuilder sb, String text) { private static void accText(final StringBuilder sb, String text) {
if (text == null || text.length() == 0) return; if (text == null || text.length() == 0) return;
if (sb.length() != 0) sb.append(' '); if (sb.length() != 0) sb.append(' ');
@ -616,6 +617,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, yacydoc.lat()); if (allAttr || contains(YaCySchema.lat_coordinate)) add(doc, YaCySchema.lat_coordinate, yacydoc.lat());
if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, yacydoc.lon()); if (allAttr || contains(YaCySchema.lon_coordinate)) add(doc, YaCySchema.lon_coordinate, yacydoc.lon());
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(yacydoc.lat()) + "," + Double.toString(yacydoc.lon()));
} }
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode()); if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode());

@ -27,11 +27,11 @@ package net.yacy.search.index;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.services.federated.solr.Schema; import net.yacy.cora.services.federated.solr.Schema;
import net.yacy.cora.services.federated.solr.SolrType; import net.yacy.cora.services.federated.solr.SolrType;
import org.apache.solr.common.SolrInputDocument;
public enum YaCySchema implements Schema { public enum YaCySchema implements Schema {
// mandatory // mandatory
@ -46,7 +46,7 @@ public enum YaCySchema implements Schema {
process_s(SolrType.string, true, true, "index creation comment"), process_s(SolrType.string, true, true, "index creation comment"),
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
// optional but recommended, part of index distribution // optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, "time when resource was loaded"), load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"), fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
@ -56,10 +56,11 @@ public enum YaCySchema implements Schema {
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio(); audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo(); videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp(); applinkscount_i(SolrType.integer, true, true, "number of links to application resources"),// int lapp();
// optional but recommended // optional but recommended
lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"), lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"), // deprecated
lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"), lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"), // deprecated
coordinate_p(SolrType.location, true, true, "point in degrees of latitude,longitude as declared in WSG84"),
ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"), ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, "content of author-tag"), author(SolrType.text_general, true, true, "content of author-tag"),
description(SolrType.text_general, true, true, "content of description-tag"), description(SolrType.text_general, true, true, "content of description-tag"),
@ -73,8 +74,8 @@ public enum YaCySchema implements Schema {
imagescount_i(SolrType.integer, true, true, "number of images"), imagescount_i(SolrType.integer, true, true, "number of images"),
responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"), responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, "all visible text"), text_t(SolrType.text_general, true, true, "all visible text"),
// optional values // optional values
csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"), csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"), css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"), css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
@ -146,7 +147,7 @@ public enum YaCySchema implements Schema {
ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"); ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions");
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type; private final SolrType type;
private final boolean indexed, stored; private final boolean indexed, stored;
@ -222,7 +223,7 @@ public enum YaCySchema implements Schema {
@Override @Override
public final String getComment() { public final String getComment() {
return this.comment; return this.comment;
} }
public final void add(final SolrInputDocument doc, final String value) { public final void add(final SolrInputDocument doc, final String value) {
doc.setField(this.getSolrFieldName(), value); doc.setField(this.getSolrFieldName(), value);

@ -48,6 +48,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.HandleSet; import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.geolocation.GeoLocation;
import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
@ -60,6 +61,7 @@ import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.index.YaCySchema;
import net.yacy.search.ranking.RankingProfile; import net.yacy.search.ranking.RankingProfile;
public final class QueryParams { public final class QueryParams {
@ -323,15 +325,6 @@ public final class QueryParams {
return this.domType == Searchdom.LOCAL; return this.domType == Searchdom.LOCAL;
} }
/**
 * Builds a query string for the text_t field out of the include and
 * exclude words of this query.
 * Every include word is prefixed with '+', every exclude word with "+-";
 * the leading '+' of the concatenation is dropped and the result is
 * prefixed with "text_t:".
 * @return the query string, or null if there are no include words
 */
public String solrQuery() {
    final boolean noIncludeWords = this.query_include_words == null || this.query_include_words.size() == 0;
    if (noIncludeWords) return null;
    final StringBuilder terms = new StringBuilder(80);
    for (final String word : this.query_include_words) {
        terms.append('+').append(word);
    }
    for (final String word : this.query_exclude_words) {
        terms.append("+-").append(word);
    }
    if (terms.length() == 0) return null;
    // skip the leading '+' that was written in front of the first word
    return "text_t:" + terms.substring(1);
}
public static HandleSet hashes2Set(final String query) { public static HandleSet hashes2Set(final String query) {
final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
if (query != null) { if (query != null) {
@ -475,13 +468,42 @@ public final class QueryParams {
} }
public String solrQueryString(boolean urlencoded) { public String solrQueryString(boolean urlencoded) {
final StringBuilder q = new StringBuilder(); if (this.query_include_words == null || this.query_include_words.size() == 0) return null;
if (this.query_include_words != null) { final StringBuilder q = new StringBuilder(80);
for (String s: this.query_include_words) q.append(urlencoded ? '+' : ' ').append(s); q.append("{!lucene q.op=AND}");
for (String s: this.query_exclude_words) q.append(urlencoded ? "+-" : " -").append(s);
// add text query
q.append("text_t:");
int wc = 0;
for (String s: this.query_include_words) {
if (wc > 0) q.append(urlencoded ? '+' : ' ');
q.append(s);
wc++;
} }
if (urlencoded) return CharacterCoding.unicode2html(q.length() > 0 ? q.substring(1) : q.toString(), true); for (String s: this.query_exclude_words){
return q.length() > 0 ? q.substring(1) : q.toString(); if (wc > 0) q.append(urlencoded ? "+-" : " -");
q.append(s);
wc++;
}
// add constraints
if ( this.sitehash == null ) {
if (this.siteexcludes != null) {
for (String ex: this.siteexcludes) {
q.append(urlencoded ? "+AND+-host_id_s:" : " AND -host_id_s:").append(ex);
}
}
} else {
q.append(urlencoded ? "+AND+host_id_s:" : " AND host_id_s:").append(this.sitehash);
}
if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
q.append("&fq={!bbox sfield=").append(YaCySchema.coordinate_p.name()).append("}&pt=");
q.append(Double.toString(this.lat)).append(',').append(Double.toString(this.lon)).append("&d=").append(GeoLocation.degreeToKm(this.radius));
}
// prepare result
return (urlencoded) ? CharacterCoding.unicode2html(q.toString(), true) : q.toString();
} }
public String queryStringForUrl() { public String queryStringForUrl() {

@ -150,7 +150,7 @@ public final class SearchEvent {
this.rankingProcess = new RWIProcess(this.query, this.order, remote); this.rankingProcess = new RWIProcess(this.query, this.order, remote);
// start a local solr search // start a local solr search
RemoteSearch.solrRemoteSearch(this, 100, this.query.query_include_hashes, 10000, null, Switchboard.urlBlacklist); RemoteSearch.solrRemoteSearch(this, 100, 10000, null /*this peer*/, Switchboard.urlBlacklist);
// start a local RWI search concurrently // start a local RWI search concurrently
this.rankingProcess.start(); this.rankingProcess.start();

Loading…
Cancel
Save