added solr faceted search support to YaCy search results

added solr highlighting / YaCy snippets to YaCy search results
- facets are now much more complete
- facets are computed and searched much faster
- snippet computation is done by solr if solr knows the snippet
pull/1/head
Michael Peter Christen 13 years ago
parent b30a7162fa
commit 2371ef031c

@ -159,7 +159,7 @@ public class AccessTracker_p {
// put values in template
prop.put("page_list_" + m + "_dark", ((dark) ? 1 : 0) );
dark =! dark;
prop.putHTML("page_list_" + m + "_host", query.host);
prop.putHTML("page_list_" + m + "_host", query.clienthost);
prop.put("page_list_" + m + "_date", GenericFormatter.SIMPLE_FORMATTER.format(new Date(query.starttime)));
prop.put("page_list_" + m + "_timestamp", query.starttime);
if (page == 2) {

@ -377,7 +377,7 @@ public class Crawler_p {
String hosthash = u.hosthash();
try {
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.name() + ":[* TO *]");
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
sb.index.fulltext().commit();
} catch (IOException e) {Log.logException(e);}
}

@ -138,7 +138,7 @@ public class HostBrowser {
int maxcount = admin ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrices for 2, 3, 4, 5, 6, 7, 8, 9 rows/columns
// collect hosts from index
ReversibleScoreMap<String> hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name());
ReversibleScoreMap<String> hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// collect hosts from crawler
@ -148,7 +148,7 @@ public class HostBrowser {
}
// collect the errorurls
ReversibleScoreMap<String> errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.name() + ":[* TO *]", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name()) : null;
ReversibleScoreMap<String> errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()) : null;
if (errorscore == null) errorscore = new ClusteredScoreMap<String>();
int c = 0;
@ -205,13 +205,13 @@ public class HostBrowser {
// get all files for a specific host from the index
StringBuilder q = new StringBuilder();
q.append(YaCySchema.host_s.name()).append(':').append(host);
q.append(YaCySchema.host_s.getSolrFieldName()).append(':').append(host);
if (pathparts.length > 0 && pathparts[0].length() > 0) {
for (String pe: pathparts) {
if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(':').append(pe);
if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(':').append(pe);
}
} else {
if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(":[* TO *]");
if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(":[* TO *]");
}
BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 3000, 100);
SolrDocument doc;
@ -224,7 +224,7 @@ public class HostBrowser {
long timeout = System.currentTimeMillis() + 3000;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
String error = (String) doc.getFieldValue(YaCySchema.failreason_t.name());
String error = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName());
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));

@ -231,7 +231,6 @@ public final class search {
ContentDomain.contentdomParser(contentdom),
language,
new HashSet<Tagging.Metatag>(),
"", // no navigation
null, // no snippet computation
count,
0,
@ -242,6 +241,7 @@ public final class search {
false,
sitehash,
null,
null,
authorhash,
DigestURI.TLD_any_zone_filter,
client,
@ -296,7 +296,6 @@ public final class search {
ContentDomain.contentdomParser(contentdom),
language,
new HashSet<Tagging.Metatag>(),
"", // no navigation
null, // no snippet computation
count,
0,
@ -307,6 +306,7 @@ public final class search {
false,
sitehash,
null,
null,
authorhash,
DigestURI.TLD_any_zone_filter,
client,
@ -373,7 +373,7 @@ public final class search {
// prepare reference hints
final long timer = System.currentTimeMillis();
final ScoreMap<String> topicNavigator = theSearch.getTopicNavigator(5);
final ScoreMap<String> topicNavigator = theSearch.rankingProcess.getTopicNavigator(5);
final StringBuilder refstr = new StringBuilder(6000);
final Iterator<String> navigatorIterator = topicNavigator.keys(false);
int i = 0;

@ -600,10 +600,6 @@ public class yacysearch {
}
}
// navigation
final String navigation =
(post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");
// the query
final Collection<String>[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute
@ -739,7 +735,6 @@ public class yacysearch {
contentdom,
language,
metatags,
navigation,
snippetFetchStrategy,
itemsPerPage,
startRecord,
@ -751,6 +746,7 @@ public class yacysearch {
constraint,
true,
sitehash,
sitehost,
DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")),
authorhash,
DigestURI.TLD_any_zone_filter,
@ -869,8 +865,7 @@ public class yacysearch {
0,
theQuery,
suggestion,
originalUrlMask.toString(),
theQuery.navigators).toString());
originalUrlMask.toString()).toString());
prop.put("didYouMean_suggestions_" + meanCount + "_sep", "|");
meanCount++;
} catch (ConcurrentModificationException e) {break meanCollect;}
@ -947,8 +942,7 @@ public class yacysearch {
thispage - 1,
theQuery,
null,
originalUrlMask,
navigation).toString());
originalUrlMask).toString());
resnav
.append("\"><img src=\"env/grafics/navdl.gif\" alt=\"arrowleft\" width=\"16\" height=\"16\" /></a>&nbsp;");
}
@ -964,7 +958,7 @@ public class yacysearch {
} else {
resnav.append("<a href=\"");
resnav.append(QueryParams
.navurl("html", i, theQuery, null, originalUrlMask, navigation)
.navurl("html", i, theQuery, null, originalUrlMask)
.toString());
resnav.append("\"><img src=\"env/grafics/navd");
resnav.append(i + 1);
@ -983,8 +977,7 @@ public class yacysearch {
thispage + 1,
theQuery,
null,
originalUrlMask,
navigation).toString());
originalUrlMask).toString());
resnav
.append("\"><img src=\"env/grafics/navdr.gif\" alt=\"arrowright\" width=\"16\" height=\"16\" /></a>");
}

@ -100,7 +100,7 @@ public class yacysearchitem {
prop.put("remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true));
prop.put("remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true));
prop.put("remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true));
prop.put("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString()).toString());
final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, "");
if (theSearch.query.contentdom == Classification.ContentDomain.TEXT || theSearch.query.contentdom == Classification.ContentDomain.ALL) {
@ -217,7 +217,7 @@ public class yacysearchitem {
prop.putHTML("content_former", theSearch.query.queryString);
prop.putHTML("content_showPictures_former", theSearch.query.queryString);
final TextSnippet snippet = result.textSnippet();
final String desc = (snippet == null) ? "" : snippet.getLineMarked(theSearch.query.query_all_hashes);
final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(theSearch.query.query_all_hashes);
prop.put("content_description", desc);
prop.putXML("content_description-xml", desc);
prop.putJSON("content_description-json", desc);

@ -42,7 +42,7 @@ public class yacysearchlatestinfo {
prop.put("remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true));
prop.put("remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true));
prop.put("remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true));
prop.putJSON("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.putJSON("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString()).toString());
return prop;
}

@ -64,20 +64,20 @@ public class yacysearchtrailer {
// compose search navigation
// namespace navigators
final ScoreMap<String> namespaceNavigator = theSearch.getNamespaceNavigator();
String name;
int count;
Iterator<String> navigatorIterator;
if (namespaceNavigator == null || namespaceNavigator.isEmpty()) {
if (theSearch.namespaceNavigator == null || theSearch.namespaceNavigator.isEmpty()) {
prop.put("nav-namespace", 0);
} else {
prop.put("nav-namespace", 1);
navigatorIterator = namespaceNavigator.keys(false);
navigatorIterator = theSearch.namespaceNavigator.keys(false);
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 10 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = namespaceNavigator.get(name);
count = theSearch.namespaceNavigator.get(name);
if (count == 0) break;
nav = "inurl%3A" + name;
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -93,7 +93,7 @@ public class yacysearchtrailer {
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
}
prop.put(fileType, "nav-namespace_element_" + i + "_name", name);
prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString());
prop.put("nav-namespace_element_" + i + "_count", count);
prop.put("nav-namespace_element_" + i + "_nl", 1);
i++;
@ -106,7 +106,7 @@ public class yacysearchtrailer {
}
// host navigators
final ScoreMap<String> hostNavigator = theSearch.getHostNavigator();
final ScoreMap<String> hostNavigator = theSearch.rankingProcess.getHostNavigator();
if (hostNavigator == null || hostNavigator.isEmpty()) {
prop.put("nav-domains", 0);
} else {
@ -117,6 +117,7 @@ public class yacysearchtrailer {
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = hostNavigator.get(name);
if (count == 0) break;
nav = "site%3A" + name;
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -132,7 +133,7 @@ public class yacysearchtrailer {
prop.put(fileType, "nav-authors_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-domains_element_" + i + "_name", name);
prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString());
prop.put("nav-domains_element_" + i + "_count", count);
prop.put("nav-domains_element_" + i + "_nl", 1);
i++;
@ -145,17 +146,17 @@ public class yacysearchtrailer {
}
// author navigators
final ScoreMap<String> authorNavigator = theSearch.getAuthorNavigator();
if (authorNavigator == null || authorNavigator.isEmpty()) {
if (theSearch.authorNavigator == null || theSearch.authorNavigator.isEmpty()) {
prop.put("nav-authors", 0);
} else {
prop.put("nav-authors", 1);
navigatorIterator = authorNavigator.keys(false);
navigatorIterator = theSearch.authorNavigator.keys(false);
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = authorNavigator.get(name);
count = theSearch.authorNavigator.get(name);
if (count == 0) break;
nav = (name.indexOf(' ', 0) < 0) ? "author%3A" + name : "author%3A%28" + name.replace(" ", "+") + "%29";
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -171,7 +172,7 @@ public class yacysearchtrailer {
prop.put(fileType, "nav-authors_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-authors_element_" + i + "_name", name);
prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString());
prop.put("nav-authors_element_" + i + "_count", count);
prop.put("nav-authors_element_" + i + "_nl", 1);
i++;
@ -184,7 +185,7 @@ public class yacysearchtrailer {
}
// topics navigator
final ScoreMap<String> topicNavigator = theSearch.getTopicNavigator(MAX_TOPWORDS);
final ScoreMap<String> topicNavigator = theSearch.rankingProcess.getTopicNavigator(MAX_TOPWORDS);
if (topicNavigator == null || topicNavigator.isEmpty()) {
prop.put("nav-topics", "0");
} else {
@ -195,13 +196,14 @@ public class yacysearchtrailer {
while (i < MAX_TOPWORDS && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = topicNavigator.get(name);
if (count == 0) break;
if (theSearch.query.queryString == null) break;
if (name != null) {
queryStringForUrl = theSearch.query.queryStringForUrl();
prop.put("nav-topics_element_" + i + "_on", 1);
prop.put(fileType, "nav-topics_element_" + i + "_modifier", name);
prop.put(fileType, "nav-topics_element_" + i + "_name", name);
prop.put(fileType, "nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl + "+" + name, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl + "+" + name, theSearch.query.urlMask.toString()).toString());
prop.put("nav-topics_element_" + i + "_count", count);
prop.put("nav-topics_element_" + i + "_nl", 1);
i++;
@ -213,17 +215,17 @@ public class yacysearchtrailer {
}
// protocol navigators
final ScoreMap<String> protocolNavigator = theSearch.getProtocolNavigator();
if (protocolNavigator == null || protocolNavigator.isEmpty()) {
if (theSearch.protocolNavigator == null || theSearch.protocolNavigator.isEmpty()) {
prop.put("nav-protocols", 0);
} else {
prop.put("nav-protocols", 1);
navigatorIterator = protocolNavigator.keys(false);
navigatorIterator = theSearch.protocolNavigator.keys(false);
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = protocolNavigator.get(name);
count = theSearch.protocolNavigator.get(name);
if (count == 0) break;
nav = "%2F" + name;
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -239,7 +241,7 @@ public class yacysearchtrailer {
prop.put(fileType, "nav-protocols_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-protocols_element_" + i + "_name", name);
prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().startsWith(name)) ? ".*" : theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().startsWith(name)) ? ".*" : theSearch.query.urlMask.toString()).toString());
prop.put("nav-protocols_element_" + i + "_count", count);
prop.put("nav-protocols_element_" + i + "_nl", 1);
i++;
@ -252,17 +254,17 @@ public class yacysearchtrailer {
}
// filetype navigators
final ScoreMap<String> filetypeNavigator = theSearch.getFiletypeNavigator();
if (filetypeNavigator == null || filetypeNavigator.isEmpty()) {
if (theSearch.filetypeNavigator == null || theSearch.filetypeNavigator.isEmpty()) {
prop.put("nav-filetypes", 0);
} else {
prop.put("nav-filetypes", 1);
navigatorIterator = filetypeNavigator.keys(false);
navigatorIterator = theSearch.filetypeNavigator.keys(false);
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = filetypeNavigator.get(name);
count = theSearch.filetypeNavigator.get(name);
if (count == 0) break;
nav = "filetype%3A" + name;
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -278,7 +280,7 @@ public class yacysearchtrailer {
prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-filetypes_element_" + i + "_name", name);
prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().endsWith(name)) ? ".*" : theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().endsWith(name)) ? ".*" : theSearch.query.urlMask.toString()).toString());
prop.put("nav-filetypes_element_" + i + "_count", count);
prop.put("nav-filetypes_element_" + i + "_nl", 1);
i++;
@ -291,7 +293,7 @@ public class yacysearchtrailer {
}
// vocabulary navigators
final Map<String, ScoreMap<String>> vocabularyNavigators = theSearch.getVocabularyNavigators();
final Map<String, ScoreMap<String>> vocabularyNavigators = theSearch.rankingProcess.getVocabularyNavigators();
if (vocabularyNavigators != null && !vocabularyNavigators.isEmpty()) {
int navvoccount = 0;
vocnav: for (Map.Entry<String, ScoreMap<String>> ve: vocabularyNavigators.entrySet()) {
@ -306,6 +308,7 @@ public class yacysearchtrailer {
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = ve.getValue().get(name);
if (count == 0) break;
nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString();
queryStringForUrl = theSearch.query.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
@ -319,7 +322,7 @@ public class yacysearchtrailer {
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString());
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString());
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1);
i++;

@ -215,6 +215,11 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
writer.write("</rss>\n".toCharArray());
}
/**
* produce snippets from solr (they call that 'highlighting')
* @param val
* @return a map from urlhashes to a list of snippets for that url
*/
@SuppressWarnings("unchecked")
public static Map<String, List<String>> highlighting(final SimpleOrderedMap<Object> val) {
Map<String, List<String>> snippets = new HashMap<String, List<String>>();

@ -708,7 +708,7 @@ public class Domains {
/**
* resolve a host address using a local DNS cache and a DNS lookup if necessary
* @param host
* @param host0 the host name to resolve
* @return the hosts InetAddress or null if the address cannot be resolved
*/
public static InetAddress dnsResolve(final String host0) {

@ -69,6 +69,7 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -77,6 +78,8 @@ import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ResultURLs;
@ -112,11 +115,14 @@ import net.yacy.utils.crypt;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.response.ResultContext;
@ -1022,6 +1028,8 @@ public final class Protocol
}
}
private final static YaCySchema[] snippetFields = new YaCySchema[]{YaCySchema.h1_txt, YaCySchema.h2_txt, YaCySchema.text_t};
protected static int solrQuery(
final SearchEvent event,
final int offset,
@ -1038,6 +1046,24 @@ public final class Protocol
final SolrQuery solrQuery = event.query.solrQuery();
solrQuery.setStart(offset);
solrQuery.setRows(count);
// set facet query attributes
if (event.query.facetfields.length > 0) {
solrQuery.setFacet(true);
solrQuery.setFacetLimit(event.query.maxfacets);
solrQuery.setFacetSort(FacetParams.FACET_SORT_COUNT);
for (String field: event.query.facetfields) solrQuery.addFacetField(field);
}
// set highlighting query attributes
solrQuery.setHighlight(true);
solrQuery.setHighlightFragsize(SearchEvent.SNIPPET_MAX_LENGTH);
//solrQuery.setHighlightRequireFieldMatch();
solrQuery.setHighlightSimplePost("</b>");
solrQuery.setHighlightSimplePre("<b>");
solrQuery.setHighlightSnippets(1);
for (YaCySchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
boolean localsearch = target == null || target.equals(event.peers.mySeed());
if (localsearch) {
// search the local index
@ -1064,6 +1090,34 @@ public final class Protocol
}
}
// evaluate facets
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.length);
for (String field: event.query.facetfields) {
FacetField facet = rsp.getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet == null ? null : facet.getValues();
if (values == null) continue;
for (Count ff: values) result.set(ff.getName(), (int) ff.getCount());
facets.put(field, result);
}
// evaluate snippets
Map<String, Map<String, List<String>>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
nextsnippet: for (Map.Entry<String, Map<String, List<String>>> re: rawsnippets.entrySet()) {
Map<String, List<String>> rs = re.getValue();
for (YaCySchema field: snippetFields) {
if (rs.containsKey(field.getSolrFieldName())) {
List<String> s = rs.get(field.getSolrFieldName());
if (s.size() > 0) {
snippets.put(re.getKey(), s.get(0));
continue nextsnippet;
}
}
}
// no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method
}
// evaluate result
List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
if (docList.size() == 0) {
@ -1126,12 +1180,12 @@ public final class Protocol
}
if (localsearch) {
event.add(container, true, "localpeer", (int) docList.getNumFound());
event.add(container, facets, snippets, true, "localpeer", (int) docList.getNumFound());
event.rankingProcess.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.logInfo("local search (solr): localpeer sent " + container.get(0).size() + "/" + docList.size() + " references");
} else {
event.add(container, false, target.getName() + "/" + target.hash, (int) docList.getNumFound());
event.add(container, facets, snippets, false, target.getName() + "/" + target.hash, (int) docList.getNumFound());
event.rankingProcess.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.logInfo("remote search (solr): peer " + target.getName() + " sent " + container.get(0).size() + "/" + docList.size() + " references");

@ -176,7 +176,7 @@ public class RemoteSearch extends Thread {
QueryParams.hashSet2hashString(event.query.query_exclude_hashes),
event.query.modifier,
event.query.targetlang == null ? "" : event.query.targetlang,
event.query.sitehash == null ? "" : event.query.sitehash,
event.query.nav_sitehash == null ? "" : event.query.nav_sitehash,
event.query.authorhash == null ? "" : event.query.authorhash,
event.query.contentdom == null ? "all" : event.query.contentdom.toString(),
count,

@ -318,14 +318,14 @@ public final class Fulltext implements Iterable<byte[]> {
final String host = uri.getHost();
Thread t = new Thread(){
public void run() {
final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);
final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.getSolrFieldName() + ":" + host, 0, 1000000, 600000, -1);
try {
SolrDocument doc;
boolean removed = false;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (u.startsWith(path)) {
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
removed = true;
}
}
@ -805,7 +805,7 @@ public final class Fulltext implements Iterable<byte[]> {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"");
Fulltext.this.solr.commit();
} catch (IOException e) {}
}

@ -379,7 +379,7 @@ public class Segment {
// STORE PAGE INDEX INTO WORD INDEX DB
int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size();
final RankingProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
final RankingProcess rankingProcess = (searchEvent == null) ? null : searchEvent.rankingProcess;
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;

@ -87,6 +87,14 @@ public final class QueryParams {
}
}
private static final String[] defaultfacetfields = new String[]{
YaCySchema.host_s.getSolrFieldName(),
YaCySchema.url_protocol_s.getSolrFieldName(),
YaCySchema.url_file_ext_s.getSolrFieldName(),
YaCySchema.author.getSolrFieldName()};
private static final int defaultmaxfacets = 30;
private static final String ampersand = "&amp;";
public static class Modifier {
@ -114,7 +122,6 @@ public final class QueryParams {
public final Classification.ContentDomain contentdom;
public final String targetlang;
protected final Collection<Tagging.Metatag> metatags;
public final String navigators;
public final Searchdom domType;
private final int zonecode;
public final int maxDistance;
@ -123,8 +130,9 @@ public final class QueryParams {
protected CacheStrategy snippetCacheStrategy;
public final RankingProfile ranking;
private final Segment indexSegment;
public final String host; // this is the client host that starts the query, not a site operator
public final String sitehash; // this is a domain hash, 6 bytes long or null
public final String clienthost; // this is the client host that starts the query, not a site operator
public final String nav_sitehost; // this is a domain name which is used to navigate to that host
public final String nav_sitehash; // this is a domain hash, 6 bytes long or null
protected final Set<String> siteexcludes; // set of domain hashes that are excluded if not included by sitehash
public final String authorhash;
public final Modifier modifier;
@ -138,6 +146,8 @@ public final class QueryParams {
public final String userAgent;
protected boolean filterfailurls;
protected double lat, lon, radius;
public String[] facetfields;
public int maxfacets;
// the following values are filled during the search process as statistics for the search
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
@ -197,15 +207,15 @@ public final class QueryParams {
this.constraint = constraint;
this.allofconstraint = false;
this.snippetCacheStrategy = null;
this.host = null;
this.sitehash = null;
this.clienthost = null;
this.nav_sitehash = null;
this.nav_sitehost = null;
this.siteexcludes = null;
this.authorhash = null;
this.remotepeer = null;
this.starttime = Long.valueOf(System.currentTimeMillis());
this.maxtime = 10000;
this.timeout = this.starttime + this.timeout;
this.navigators = "all";
this.indexSegment = indexSegment;
this.userAgent = userAgent;
this.transmitcount = 0;
@ -221,6 +231,8 @@ public final class QueryParams {
this.remote_available = new AtomicInteger(0); // the number of result contributions from all the remote peers
this.remote_peerCount = new AtomicInteger(0); // the number of remote peers that have contributed
this.misses = Collections.synchronizedSortedSet(new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder));
this.facetfields = defaultfacetfields;
this.maxfacets = defaultmaxfacets;
}
public QueryParams(
@ -235,12 +247,12 @@ public final class QueryParams {
final int maxDistance, final String prefer, final ContentDomain contentdom,
final String language,
final Collection<Tagging.Metatag> metatags,
final String navigators,
final CacheStrategy snippetCacheStrategy,
final int itemsPerPage, final int offset, final String urlMask,
final Searchdom domType, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint,
final String site,
final String nav_sitehash,
final String nav_sitehost,
final Set<String> siteexcludes,
final String authorhash,
final int domainzone,
@ -280,16 +292,16 @@ public final class QueryParams {
assert language != null;
this.targetlang = language;
this.metatags = metatags;
this.navigators = navigators;
this.domType = domType;
this.zonecode = domainzone;
this.constraint = constraint;
this.allofconstraint = allofconstraint;
this.sitehash = site; assert site == null || site.length() == 6;
this.nav_sitehash = nav_sitehash; assert nav_sitehash == null || nav_sitehash.length() == 6;
this.nav_sitehost = nav_sitehost;
this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes;
this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty();
this.snippetCacheStrategy = snippetCacheStrategy;
this.host = host;
this.clienthost = host;
this.remotepeer = null;
this.starttime = Long.valueOf(System.currentTimeMillis());
this.maxtime = 10000;
@ -311,6 +323,8 @@ public final class QueryParams {
this.remote_available = new AtomicInteger(0); // the number of result contributions from all the remote peers
this.remote_peerCount = new AtomicInteger(0); // the number of remote peers that have contributed
this.misses = Collections.synchronizedSortedSet(new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder));
this.facetfields = defaultfacetfields;
this.maxfacets = defaultmaxfacets;
}
private double kmNormal = 100.d; // 100 =ca 40000.d / 360.d == 111.11 - if lat/lon is multiplied with this, rounded and diveded by this, the location is normalized to a 1km grid
@ -506,22 +520,30 @@ public final class QueryParams {
final StringBuilder q = solrQueryString(this.query_include_words, this.query_exclude_words, this.indexSegment.fulltext().getSolrScheme());
// add constraints
if ( this.sitehash == null ) {
if (this.nav_sitehash == null && this.nav_sitehost == null) {
if (this.siteexcludes != null) {
for (String ex: this.siteexcludes) {
q.append(" -").append(YaCySchema.host_id_s.name()).append(':').append(ex);
q.append(" -").append(YaCySchema.host_id_s.getSolrFieldName()).append(':').append(ex);
}
}
} else {
q.append(' ').append(YaCySchema.host_id_s.name()).append(':').append(this.sitehash);
if (this.nav_sitehost != null)
q.append(" AND ").append(YaCySchema.host_s.getSolrFieldName()).append(":\"").append(this.nav_sitehost).append('\"');
else
q.append(" AND ").append(YaCySchema.host_id_s.getSolrFieldName()).append(":\"").append(this.nav_sitehash).append('\"');
}
String urlMaskPattern = this.urlMask.pattern();
int extm = urlMaskPattern.indexOf(".*\\.");
if (extm >= 0) {
String ext = urlMaskPattern.substring(extm + 4);
q.append(" AND ").append(YaCySchema.url_file_ext_s.name()).append(':').append(ext);
q.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(':').append(ext);
}
extm = urlMaskPattern.indexOf("?://.*");
if (extm >= 0) {
String protocol = urlMaskPattern.substring(0, extm);
q.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append(protocol);
}
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(q.toString());
@ -537,13 +559,13 @@ public final class QueryParams {
//params.set("sfield", YaCySchema.coordinate_p.name());
//params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon));
//params.set("d", GeoLocation.degreeToKm(this.radius));
params.setFilterQueries("{!bbox sfield=" + YaCySchema.coordinate_p.name() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}");
params.setFilterQueries("{!bbox sfield=" + YaCySchema.coordinate_p.getSolrFieldName() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}");
//params.setRows(Integer.MAX_VALUE);
} else {
// set ranking
if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) {
// set a most-recent ordering
params.setSortField(YaCySchema.last_modified.name(), ORDER.desc);
params.setSortField(YaCySchema.last_modified.getSolrFieldName(), ORDER.desc);
}
}
@ -574,10 +596,10 @@ public final class QueryParams {
wc = 0;
Float boost;
for (YaCySchema field: fields) {
if (configuration != null && !configuration.contains(field.name())) continue;
if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue;
if (wc > 0) q.append(" OR ");
q.append('(');
q.append(field.name()).append(':').append(w);
q.append(field.getSolrFieldName()).append(':').append(w);
boost = boosts.get(field);
if (boost != null) q.append('^').append(boost.toString());
q.append(')');
@ -587,7 +609,7 @@ public final class QueryParams {
q.append(')');
// add filter to prevent that results come from failed urls
q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]");
q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
return q;
}
@ -665,7 +687,7 @@ public final class QueryParams {
context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);
context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk);
context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk);
context.append(this.sitehash).append(asterisk);
context.append(this.nav_sitehash).append(asterisk);
context.append(this.siteexcludes).append(asterisk);
context.append(this.authorhash).append(asterisk);
context.append(this.targetlang).append(asterisk);
@ -694,9 +716,9 @@ public final class QueryParams {
*/
public static StringBuilder navurl(
final String ext, final int page, final QueryParams theQuery,
final String newQueryString, final String originalUrlMask, final String nav) {
final String newQueryString, final String originalUrlMask) {
final StringBuilder sb = navurlBase(ext, theQuery, newQueryString, originalUrlMask, nav);
final StringBuilder sb = navurlBase(ext, theQuery, newQueryString, originalUrlMask);
sb.append(ampersand);
sb.append("startRecord=");
@ -707,7 +729,7 @@ public final class QueryParams {
public static StringBuilder navurlBase(
final String ext, final QueryParams theQuery,
final String newQueryString, final String originalUrlMask, final String nav) {
final String newQueryString, final String originalUrlMask) {
final StringBuilder sb = new StringBuilder(120);
sb.append("/yacysearch.");
@ -727,10 +749,6 @@ public final class QueryParams {
sb.append("verify=");
sb.append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName());
sb.append(ampersand);
sb.append("nav=");
sb.append(nav);
sb.append(ampersand);
sb.append("urlmaskfilter=");
sb.append(originalUrlMask);

@ -89,8 +89,9 @@ public final class RankingProcess extends Thread {
protected final AtomicInteger receivedRemoteReferences;
protected final ReferenceOrder order;
protected final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
protected final ScoreMap<String> hostNavigator; // a counter for the appearance of the host hash
protected final Map<String, byte[]> hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
protected final ScoreMap<String> hostNavigator = new ConcurrentScoreMap<String>(); // a counter for the appearance of host names
protected final ScoreMap<String> hostHashNavigator; // a counter for the appearance of the host hash (this can be filled during classic remote search)
protected final Map<String, byte[]> hostHashResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
protected final Map<String, String> taggingPredicates; // a map from tagging vocabulary names to tagging predicate uris
protected final Map<String, ScoreMap<String>> vocabularyNavigator; // counters for Vocabularies; key is metatag.getVocabularyName()
private boolean remote;
@ -117,8 +118,8 @@ public final class RankingProcess extends Thread {
this.receivedRemoteReferences = new AtomicInteger(0);
this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
this.hostNavigator = new ConcurrentScoreMap<String>();
this.hostResolver = new ConcurrentHashMap<String, byte[]>();
this.hostHashNavigator = new ConcurrentScoreMap<String>();
this.hostHashResolver = new ConcurrentHashMap<String, byte[]>();
this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
this.taggingPredicates = new HashMap<String, String>();
for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
@ -284,7 +285,6 @@ public final class RankingProcess extends Thread {
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
// apply all constraints
long timeout = System.currentTimeMillis() + maxtime;
@ -336,18 +336,16 @@ public final class RankingProcess extends Thread {
// check site constraints
final String hosthash = iEntry.hosthash();
if ( this.query.sitehash == null ) {
if ( this.query.nav_sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) continue pollloop;
} else {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop;
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.urlhash());
}
this.hostHashNavigator.inc(hosthash);
this.hostHashResolver.put(hosthash, iEntry.urlhash());
// check protocol
if (!this.query.urlMask_isCatchall) {
@ -420,31 +418,37 @@ public final class RankingProcess extends Thread {
return this.localSearchInclusion;
}
public ScoreMap<String> getHostNavigator() {
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts", 0) < 0 ) {
return result;
}
final Iterator<String> domhashs = this.hostNavigator.keys(false);
final Iterator<String> domhashs = this.hostHashNavigator.keys(false);
URIMetadataNode row;
byte[] urlhash;
String hosthash, hostname;
if ( this.hostResolver != null ) {
if ( this.hostHashResolver != null ) {
while ( domhashs.hasNext() && result.sizeSmaller(30) ) {
hosthash = domhashs.next();
if ( hosthash == null ) {
continue;
}
urlhash = this.hostResolver.get(hosthash);
urlhash = this.hostHashResolver.get(hosthash);
row = urlhash == null ? null : this.query.getSegment().fulltext().getMetadata(urlhash);
hostname = row == null ? null : row.url().getHost();
if ( hostname != null ) {
result.set(hostname, this.hostNavigator.get(hosthash));
result.set(hostname, this.hostHashNavigator.get(hosthash));
}
}
}
// add only navigation hosts which have more than one entry
Iterator<String> i = this.hostNavigator.keys(false);
while (i.hasNext()) {
String h = i.next();
int c = this.hostNavigator.get(h);
if (c <= 0) break;
result.inc(h, c);
}
return result;
}
@ -452,13 +456,10 @@ public final class RankingProcess extends Thread {
return this.vocabularyNavigator;
}
protected ScoreMap<String> getTopicNavigator(final int count) {
public ScoreMap<String> getTopicNavigator(final int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("topics", 0) < 0 ) {
return result;
}
if ( this.ref.sizeSmaller(2) ) {
this.ref.clear(); // navigators with one entry are not useful
}

@ -33,6 +33,7 @@ import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import com.hp.hpl.jena.rdf.model.RDFNode;
@ -41,6 +42,7 @@ import com.hp.hpl.jena.rdf.model.Resource;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.lod.JenaTripleStore;
@ -48,8 +50,8 @@ import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
@ -105,10 +107,10 @@ public final class SearchEvent {
private byte[] IAmaxcounthash, IAneardhthash;
private final Thread localsearch;
private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons
private final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
private final ScoreMap<String> namespaceNavigator; // a counter for name spaces
private final ScoreMap<String> protocolNavigator; // a counter for protocol types
private final ScoreMap<String> filetypeNavigator; // a counter for file types
public final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
public final ScoreMap<String> namespaceNavigator; // a counter for name spaces
public final ScoreMap<String> protocolNavigator; // a counter for protocol types
public final ScoreMap<String> filetypeNavigator; // a counter for file types
protected final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack;
protected final WeakPriorityBlockingQueue<ResultEntry> result;
protected final LoaderDispatcher loader;
@ -117,6 +119,7 @@ public final class SearchEvent {
private SnippetWorker[] workerThreads;
protected long urlRetrievalAllTime;
protected long snippetComputationAllTime;
protected ConcurrentHashMap<String, String> snippets;
private final boolean remote;
private boolean cleanupState;
@ -146,7 +149,8 @@ public final class SearchEvent {
this.namespaceNavigator = new ConcurrentScoreMap<String>();
this.protocolNavigator = new ConcurrentScoreMap<String>();
this.filetypeNavigator = new ConcurrentScoreMap<String>();
this.snippets = new ConcurrentHashMap<String, String>();
this.secondarySearchSuperviser =
(this.query.query_include_hashes.size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches
if ( this.secondarySearchSuperviser != null ) {
@ -393,53 +397,6 @@ public final class SearchEvent {
return this.secondarySearchThreads;
}
public RankingProcess getRankingResult() {
return this.rankingProcess;
}
public ScoreMap<String> getHostNavigator() {
return this.rankingProcess.getHostNavigator();
}
public ScoreMap<String> getTopicNavigator(final int count) {
// returns a set of words that are computed as toplist
return this.rankingProcess.getTopicNavigator(count);
}
public ScoreMap<String> getNamespaceNavigator() {
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
return this.namespaceNavigator;
}
public ScoreMap<String> getProtocolNavigator() {
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
return this.protocolNavigator;
}
public ScoreMap<String> getFiletypeNavigator() {
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
return this.filetypeNavigator;
}
public ScoreMap<String> getAuthorNavigator() {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) {
return new ConcurrentScoreMap<String>();
}
return this.authorNavigator;
}
public Map<String,ScoreMap<String>> getVocabularyNavigators() {
return this.rankingProcess.getVocabularyNavigators();
}
public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
synchronized ( this.heuristics ) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
@ -452,7 +409,6 @@ public final class SearchEvent {
}
}
protected boolean workerAlive() {
if ( this.workerThreads == null ) {
return false;
@ -467,11 +423,14 @@ public final class SearchEvent {
public void add(
final List<URIMetadataNode> index,
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
final Map<String, String> solrsnippets, // a map from urlhash to snippet text
final boolean local,
final String resourceName,
final int fullResource) {
this.rankingProcess.addBegin();
this.snippets.putAll(solrsnippets);
assert (index != null);
if (index.isEmpty()) return;
@ -494,8 +453,20 @@ public final class SearchEvent {
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;
// collect navigation information
ReversibleScoreMap<String> fcts = facets.get(YaCySchema.host_s.getSolrFieldName());
if (fcts != null) this.rankingProcess.hostNavigator.inc(fcts);
fcts = facets.get(YaCySchema.url_file_ext_s.getSolrFieldName());
if (fcts != null) this.filetypeNavigator.inc(fcts);
fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName());
if (fcts != null) this.protocolNavigator.inc(fcts);
//fcts = facets.get(YaCySchema.author.getSolrFieldName());
//if (fcts != null) this.authorNavigator.inc(fcts);
// apply all constraints
try {
final String pattern = this.query.urlMask.pattern();
@ -535,19 +506,13 @@ public final class SearchEvent {
// check site constraints
final String hosthash = iEntry.hosthash();
if ( this.query.sitehash == null ) {
if ( this.query.nav_sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (!hosthash.equals(this.query.sitehash)) continue pollloop;
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) {
this.rankingProcess.hostNavigator.inc(hosthash);
this.rankingProcess.hostResolver.put(hosthash, iEntry.hash());
if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop;
}
// check protocol
@ -874,13 +839,6 @@ public final class SearchEvent {
// from here: collect navigation information
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (!this.query.navigators.isEmpty() && (this.query.urlMask_isCatchall || this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0)) {
final String hosthash = page.hosthash();
this.rankingProcess.hostNavigator.inc(hosthash);
this.rankingProcess.hostResolver.put(hosthash, page.hash());
}
// namespace navigation
String pagepath = page.url().getPath();
if ( (p = pagepath.indexOf(':')) >= 0 ) {
@ -896,10 +854,6 @@ public final class SearchEvent {
final String protocol = page.url().getProtocol();
this.protocolNavigator.inc(protocol);
// file type navigation
final String fileext = page.url().getFileExtension();
if ( fileext.length() > 0 ) this.filetypeNavigator.inc(fileext);
return page; // accept url
}
Log.logWarning("RWIProcess", "loop terminated");

@ -22,9 +22,11 @@ package net.yacy.search.query;
import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
@ -36,6 +38,7 @@ import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;
public class SnippetWorker extends Thread {
private final SearchEvent snippetProcess;
@ -60,8 +63,6 @@ public class SnippetWorker extends Thread {
// start fetching urls and snippets
URIMetadataNode page;
ResultEntry resultEntry;
//final int fetchAhead = snippetMode == 0 ? 0 : 10;
final boolean nav_topics = this.snippetProcess.query.navigators.equals("all") || this.snippetProcess.query.navigators.indexOf("topics",0) >= 0;
try {
while (this.shallrun && System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
@ -97,27 +98,21 @@ public class SnippetWorker extends Thread {
continue;
}
// in case that we have an attached solr, we load also the solr document
String solrContent = page.getText();
resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) {
continue; // the entry had some problems, cannot be used
}
//if (result.contains(resultEntry)) continue;
this.snippetProcess.urlRetrievalAllTime += resultEntry.dbRetrievalTime;
this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
// apply post-ranking
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, this.snippetProcess.rankingProcess.getTopicNavigator(10));
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
resultEntry.ranking = ranking;
this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
if (nav_topics) {
this.snippetProcess.rankingProcess.addTopics(resultEntry);
}
this.snippetProcess.rankingProcess.addTopics(resultEntry);
}
if (System.currentTimeMillis() >= this.timeout) {
Log.logWarning("SnippetProcess", "worker ended with timeout");
@ -199,7 +194,7 @@ public class SnippetWorker extends Thread {
return r;
}
private ResultEntry fetchSnippet(final URIMetadataNode page, final String solrText, final CacheStrategy cacheStrategy) {
private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
@ -208,16 +203,15 @@ public class SnippetWorker extends Thread {
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
if (page == null) {
return null;
String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
if (solrsnippet != null && solrsnippet.length() > 0) {
final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
}
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet(
null,
solrText,
page,
this.snippetProcess.snippetFetchWordHashes,
//this.query.queryString,
@ -225,16 +219,15 @@ public class SnippetWorker extends Thread {
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
SearchEvent.SNIPPET_MAX_LENGTH,
!this.snippetProcess.query.isLocal());
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, 0); // result without snippet
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet
}
// load snippet
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
// attach text snippet
startTime = System.currentTimeMillis();
long startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet(
this.snippetProcess.loader,
solrText,
page,
this.snippetProcess.snippetFetchWordHashes,
cacheStrategy,
@ -246,16 +239,16 @@ public class SnippetWorker extends Thread {
if (!snippet.getErrorCode().fail()) {
// we loaded the file and found the snippet
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
} else if (cacheStrategy.mustBeOffline()) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
// we accept that because the word cannot be on the page
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0);
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
}
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
if (this.snippetProcess.deleteIfSnippetFail) {
@ -265,6 +258,6 @@ public class SnippetWorker extends Thread {
return null;
}
}
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0); // result without snippet
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet
}
}

@ -60,21 +60,20 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
private final Segment indexSegment;
// statistic objects
public long dbRetrievalTime, snippetComputationTime, ranking;
public long snippetComputationTime, ranking;
public ResultEntry(final URIMetadataNode urlentry,
final Segment indexSegment,
SeedDB peers,
final TextSnippet textSnippet,
final List<MediaSnippet> mediaSnippets,
final long dbRetrievalTime, final long snippetComputationTime) {
final long snippetComputationTime) {
this.urlentry = urlentry;
this.indexSegment = indexSegment;
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.textSnippet = textSnippet;
this.mediaSnippets = mediaSnippets;
this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime;
final String host = urlentry.url().getHost();
if (host != null && host.endsWith(".yacyh")) {

@ -132,20 +132,21 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
private byte[] urlhash;
private String line;
private boolean isMarked;
private String error;
private ResultClass resultStatus;
public TextSnippet(
final byte[] urlhash,
final String line,
final boolean isMarked,
final ResultClass errorCode,
final String errortext) {
init(urlhash, line, errorCode, errortext);
init(urlhash, line, isMarked, errorCode, errortext);
}
public TextSnippet(
final LoaderDispatcher loader,
final String solrText,
final URIMetadataNode row,
final HandleSet queryhashes,
final CacheStrategy cacheStrategy,
@ -153,10 +154,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
final int snippetMaxLength,
final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final DigestURI url = row.url();
if (queryhashes.isEmpty()) {
//System.out.println("found no queryhashes for URL retrieve " + url);
init(url.hash(), null, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given");
init(url.hash(), null, false, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given");
return;
}
@ -167,7 +169,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
String snippetLine = snippetsCache.get(wordhashes, urls);
if (snippetLine != null) {
// found the snippet
init(url.hash(), snippetLine, source, null);
init(url.hash(), snippetLine, false, source, null);
return;
}
@ -188,6 +190,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// we did not find everything in the metadata, look further into the document itself.
// first acquire the sentences:
final String solrText = row.getText();
if (solrText != null) {
// compute sentences from solr query
final SentenceReader sr = new SentenceReader(solrText, pre);
@ -215,7 +218,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
}
if (sentences == null) {
// not found the snippet
init(url.hash(), null, ResultClass.SOURCE_METADATA, null);
init(url.hash(), null, false, ResultClass.SOURCE_METADATA, null);
return;
}
@ -225,7 +228,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
return;
}
}
@ -247,7 +250,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
textline = s.toString();
}
}
init(url.hash(), textline.length() > 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null);
init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
return;
}
@ -262,12 +265,12 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
init(url.hash(), null, false, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
return;
}
// if it is still not available, report an error
init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
init(url.hash(), null, false, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
return;
}
@ -282,11 +285,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Parser.Failure e) {
init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return;
}
if (document == null) {
init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
init(url.hash(), null, false, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
return;
}
@ -295,7 +298,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
document.close();
if (sentences == null) {
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
init(url.hash(), null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
@ -304,7 +307,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (final UnsupportedOperationException e) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
return;
}
} //encapsulate potential expensive sentences END
@ -324,7 +327,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
//if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "<br />" + textline;
if (snippetLine == null || !remainingHashes.isEmpty()) {
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
return;
}
if (snippetLine.length() > snippetMaxLength) snippetLine = snippetLine.substring(0, snippetMaxLength);
@ -333,16 +336,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
snippetsCache.put(wordhashes, urls, snippetLine);
// document.close();
init(url.hash(), snippetLine, source, null);
init(url.hash(), snippetLine, false, source, null);
}
/**
 * Central state setter used by every snippet outcome path (successful
 * computation, cache hit, and each error branch).
 *
 * @param hash    hash of the URL the snippet belongs to
 * @param text    the snippet line, or {@code null} when none was produced
 * @param marked  NOTE(review): presumably {@code true} when the text already
 *                carries highlighting markup (e.g. from Solr) — confirm
 * @param status  classification of the snippet source or failure
 * @param message human-readable error description, or {@code null} on success
 */
private void init(
        final byte[] hash,
        final String text,
        final boolean marked,
        final ResultClass status,
        final String message) {
    this.urlhash = hash;
    this.line = text;
    this.isMarked = marked;
    this.resultStatus = status;
    this.error = message;
}
@ -351,6 +356,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return this.line != null;
}
/**
 * @return the marked flag set at initialization; NOTE(review): presumably
 *         indicates the snippet text already contains highlighting — confirm
 */
public boolean isMarked() {
    return isMarked;
}
/**
 * Null-safe accessor for the snippet text.
 *
 * @return the snippet line, or the empty string when no line is available
 */
public String getLineRaw() {
    if (this.line == null) {
        return "";
    }
    return this.line;
}

Loading…
Cancel
Save