diff --git a/htroot/AccessTracker_p.java b/htroot/AccessTracker_p.java index c17ad357f..5c9c2987e 100644 --- a/htroot/AccessTracker_p.java +++ b/htroot/AccessTracker_p.java @@ -159,7 +159,7 @@ public class AccessTracker_p { // put values in template prop.put("page_list_" + m + "_dark", ((dark) ? 1 : 0) ); dark =! dark; - prop.putHTML("page_list_" + m + "_host", query.host); + prop.putHTML("page_list_" + m + "_host", query.clienthost); prop.put("page_list_" + m + "_date", GenericFormatter.SIMPLE_FORMATTER.format(new Date(query.starttime))); prop.put("page_list_" + m + "_timestamp", query.starttime); if (page == 2) { diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 86c43357a..6356cebb6 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -377,7 +377,7 @@ public class Crawler_p { String hosthash = u.hosthash(); try { sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash)); - sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.name() + ":[* TO *]"); + sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]"); sb.index.fulltext().commit(); } catch (IOException e) {Log.logException(e);} } diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 523b37d09..698bcafb8 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -138,7 +138,7 @@ public class HostBrowser { int maxcount = admin ? 
2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums // collect hosts from index - ReversibleScoreMap hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name()); + ReversibleScoreMap hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()); if (hostscore == null) hostscore = new ClusteredScoreMap(); // collect hosts from crawler @@ -148,7 +148,7 @@ public class HostBrowser { } // collect the errorurls - ReversibleScoreMap errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.name() + ":[* TO *]", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name()) : null; + ReversibleScoreMap errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()) : null; if (errorscore == null) errorscore = new ClusteredScoreMap(); int c = 0; @@ -205,13 +205,13 @@ public class HostBrowser { // get all files for a specific host from the index StringBuilder q = new StringBuilder(); - q.append(YaCySchema.host_s.name()).append(':').append(host); + q.append(YaCySchema.host_s.getSolrFieldName()).append(':').append(host); if (pathparts.length > 0 && pathparts[0].length() > 0) { for (String pe: pathparts) { - if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(':').append(pe); + if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(':').append(pe); } } else { - if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(":[* TO *]"); + if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND 
").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(":[* TO *]"); } BlockingQueue docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 3000, 100); SolrDocument doc; @@ -224,7 +224,7 @@ public class HostBrowser { long timeout = System.currentTimeMillis() + 3000; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); - String error = (String) doc.getFieldValue(YaCySchema.failreason_t.name()); + String error = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName()); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name()))); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 3e668383a..5d76c7b1b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -231,7 +231,6 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, new HashSet(), - "", // no navigation null, // no snippet computation count, 0, @@ -242,6 +241,7 @@ public final class search { false, sitehash, null, + null, authorhash, DigestURI.TLD_any_zone_filter, client, @@ -296,7 +296,6 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, new HashSet(), - "", // no navigation null, // no snippet computation count, 0, @@ -307,6 +306,7 @@ public final class search { false, sitehash, null, + null, authorhash, DigestURI.TLD_any_zone_filter, client, @@ -373,7 +373,7 @@ public final class search { // prepare reference hints final long timer = System.currentTimeMillis(); - final ScoreMap topicNavigator = theSearch.getTopicNavigator(5); + final ScoreMap topicNavigator = theSearch.rankingProcess.getTopicNavigator(5); final StringBuilder refstr = new StringBuilder(6000); final Iterator navigatorIterator = topicNavigator.keys(false); int i = 0; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 96d308449..811457f96 100644 --- 
a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -600,10 +600,6 @@ public class yacysearch { } } - // navigation - final String navigation = - (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", ""); - // the query final Collection[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute @@ -739,7 +735,6 @@ public class yacysearch { contentdom, language, metatags, - navigation, snippetFetchStrategy, itemsPerPage, startRecord, @@ -751,6 +746,7 @@ public class yacysearch { constraint, true, sitehash, + sitehost, DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")), authorhash, DigestURI.TLD_any_zone_filter, @@ -869,8 +865,7 @@ public class yacysearch { 0, theQuery, suggestion, - originalUrlMask.toString(), - theQuery.navigators).toString()); + originalUrlMask.toString()).toString()); prop.put("didYouMean_suggestions_" + meanCount + "_sep", "|"); meanCount++; } catch (ConcurrentModificationException e) {break meanCollect;} @@ -947,8 +942,7 @@ public class yacysearch { thispage - 1, theQuery, null, - originalUrlMask, - navigation).toString()); + originalUrlMask).toString()); resnav .append("\">\"arrowleft\" "); } @@ -964,7 +958,7 @@ public class yacysearch { } else { resnav.append("\"arrowright\""); } diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 827469f04..e7433aa77 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -100,7 +100,7 @@ public class yacysearchitem { prop.put("remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true)); prop.put("remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true)); prop.put("remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true)); - prop.put("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put("navurlBase", QueryParams.navurlBase("html", 
theSearch.query, null, theSearch.query.urlMask.toString()).toString()); final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, ""); if (theSearch.query.contentdom == Classification.ContentDomain.TEXT || theSearch.query.contentdom == Classification.ContentDomain.ALL) { @@ -217,7 +217,7 @@ public class yacysearchitem { prop.putHTML("content_former", theSearch.query.queryString); prop.putHTML("content_showPictures_former", theSearch.query.queryString); final TextSnippet snippet = result.textSnippet(); - final String desc = (snippet == null) ? "" : snippet.getLineMarked(theSearch.query.query_all_hashes); + final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(theSearch.query.query_all_hashes); prop.put("content_description", desc); prop.putXML("content_description-xml", desc); prop.putJSON("content_description-json", desc); diff --git a/htroot/yacysearchlatestinfo.java b/htroot/yacysearchlatestinfo.java index 1ac3e3d61..4b963b495 100644 --- a/htroot/yacysearchlatestinfo.java +++ b/htroot/yacysearchlatestinfo.java @@ -42,7 +42,7 @@ public class yacysearchlatestinfo { prop.put("remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true)); prop.put("remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true)); prop.put("remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true)); - prop.putJSON("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.putJSON("navurlBase", QueryParams.navurlBase("html", theSearch.query, null, theSearch.query.urlMask.toString()).toString()); return prop; } diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 7044874f1..a4d551f80 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -64,20 +64,20 @@ public class 
yacysearchtrailer { // compose search navigation // namespace navigators - final ScoreMap namespaceNavigator = theSearch.getNamespaceNavigator(); String name; int count; Iterator navigatorIterator; - if (namespaceNavigator == null || namespaceNavigator.isEmpty()) { + if (theSearch.namespaceNavigator == null || theSearch.namespaceNavigator.isEmpty()) { prop.put("nav-namespace", 0); } else { prop.put("nav-namespace", 1); - navigatorIterator = namespaceNavigator.keys(false); + navigatorIterator = theSearch.namespaceNavigator.keys(false); int i = 0, p, pos = 0, neg = 0; String nav, queryStringForUrl; while (i < 10 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); - count = namespaceNavigator.get(name); + count = theSearch.namespaceNavigator.get(name); + if (count == 0) break; nav = "inurl%3A" + name; queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -93,7 +93,7 @@ public class yacysearchtrailer { queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim(); } prop.put(fileType, "nav-namespace_element_" + i + "_name", name); - prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString()); prop.put("nav-namespace_element_" + i + "_count", count); prop.put("nav-namespace_element_" + i + "_nl", 1); i++; @@ -106,7 +106,7 @@ public class yacysearchtrailer { } // host navigators - final ScoreMap hostNavigator = theSearch.getHostNavigator(); + final ScoreMap hostNavigator = theSearch.rankingProcess.getHostNavigator(); if (hostNavigator == null || hostNavigator.isEmpty()) { prop.put("nav-domains", 0); } else { @@ 
-117,6 +117,7 @@ public class yacysearchtrailer { while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); count = hostNavigator.get(name); + if (count == 0) break; nav = "site%3A" + name; queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -132,7 +133,7 @@ public class yacysearchtrailer { prop.put(fileType, "nav-authors_element_" + i + "_modifier", "-" + nav); } prop.put(fileType, "nav-domains_element_" + i + "_name", name); - prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString()); prop.put("nav-domains_element_" + i + "_count", count); prop.put("nav-domains_element_" + i + "_nl", 1); i++; @@ -145,17 +146,17 @@ public class yacysearchtrailer { } // author navigators - final ScoreMap authorNavigator = theSearch.getAuthorNavigator(); - if (authorNavigator == null || authorNavigator.isEmpty()) { + if (theSearch.authorNavigator == null || theSearch.authorNavigator.isEmpty()) { prop.put("nav-authors", 0); } else { prop.put("nav-authors", 1); - navigatorIterator = authorNavigator.keys(false); + navigatorIterator = theSearch.authorNavigator.keys(false); int i = 0, p, pos = 0, neg = 0; String nav, queryStringForUrl; while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next().trim(); - count = authorNavigator.get(name); + count = theSearch.authorNavigator.get(name); + if (count == 0) break; nav = (name.indexOf(' ', 0) < 0) ? 
"author%3A" + name : "author%3A%28" + name.replace(" ", "+") + "%29"; queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -171,7 +172,7 @@ public class yacysearchtrailer { prop.put(fileType, "nav-authors_element_" + i + "_modifier", "-" + nav); } prop.put(fileType, "nav-authors_element_" + i + "_name", name); - prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString()); prop.put("nav-authors_element_" + i + "_count", count); prop.put("nav-authors_element_" + i + "_nl", 1); i++; @@ -184,7 +185,7 @@ public class yacysearchtrailer { } // topics navigator - final ScoreMap topicNavigator = theSearch.getTopicNavigator(MAX_TOPWORDS); + final ScoreMap topicNavigator = theSearch.rankingProcess.getTopicNavigator(MAX_TOPWORDS); if (topicNavigator == null || topicNavigator.isEmpty()) { prop.put("nav-topics", "0"); } else { @@ -195,13 +196,14 @@ public class yacysearchtrailer { while (i < MAX_TOPWORDS && navigatorIterator.hasNext()) { name = navigatorIterator.next(); count = topicNavigator.get(name); + if (count == 0) break; if (theSearch.query.queryString == null) break; if (name != null) { queryStringForUrl = theSearch.query.queryStringForUrl(); prop.put("nav-topics_element_" + i + "_on", 1); prop.put(fileType, "nav-topics_element_" + i + "_modifier", name); prop.put(fileType, "nav-topics_element_" + i + "_name", name); - prop.put(fileType, "nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl + "+" + name, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, 
"nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl + "+" + name, theSearch.query.urlMask.toString()).toString()); prop.put("nav-topics_element_" + i + "_count", count); prop.put("nav-topics_element_" + i + "_nl", 1); i++; @@ -213,17 +215,17 @@ public class yacysearchtrailer { } // protocol navigators - final ScoreMap protocolNavigator = theSearch.getProtocolNavigator(); - if (protocolNavigator == null || protocolNavigator.isEmpty()) { + if (theSearch.protocolNavigator == null || theSearch.protocolNavigator.isEmpty()) { prop.put("nav-protocols", 0); } else { prop.put("nav-protocols", 1); - navigatorIterator = protocolNavigator.keys(false); + navigatorIterator = theSearch.protocolNavigator.keys(false); int i = 0, p, pos = 0, neg = 0; String nav, queryStringForUrl; while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next().trim(); - count = protocolNavigator.get(name); + count = theSearch.protocolNavigator.get(name); + if (count == 0) break; nav = "%2F" + name; queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -239,7 +241,7 @@ public class yacysearchtrailer { prop.put(fileType, "nav-protocols_element_" + i + "_modifier", "-" + nav); } prop.put(fileType, "nav-protocols_element_" + i + "_name", name); - prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().startsWith(name)) ? ".*" : theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().startsWith(name)) ? 
".*" : theSearch.query.urlMask.toString()).toString()); prop.put("nav-protocols_element_" + i + "_count", count); prop.put("nav-protocols_element_" + i + "_nl", 1); i++; @@ -252,17 +254,17 @@ public class yacysearchtrailer { } // filetype navigators - final ScoreMap filetypeNavigator = theSearch.getFiletypeNavigator(); - if (filetypeNavigator == null || filetypeNavigator.isEmpty()) { + if (theSearch.filetypeNavigator == null || theSearch.filetypeNavigator.isEmpty()) { prop.put("nav-filetypes", 0); } else { prop.put("nav-filetypes", 1); - navigatorIterator = filetypeNavigator.keys(false); + navigatorIterator = theSearch.filetypeNavigator.keys(false); int i = 0, p, pos = 0, neg = 0; String nav, queryStringForUrl; while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next().trim(); - count = filetypeNavigator.get(name); + count = theSearch.filetypeNavigator.get(name); + if (count == 0) break; nav = "filetype%3A" + name; queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -278,7 +280,7 @@ public class yacysearchtrailer { prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", "-" + nav); } prop.put(fileType, "nav-filetypes_element_" + i + "_name", name); - prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().endsWith(name)) ? ".*" : theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, (p >= 0 && theSearch.query.urlMask.toString().endsWith(name)) ? 
".*" : theSearch.query.urlMask.toString()).toString()); prop.put("nav-filetypes_element_" + i + "_count", count); prop.put("nav-filetypes_element_" + i + "_nl", 1); i++; @@ -291,7 +293,7 @@ public class yacysearchtrailer { } // vocabulary navigators - final Map> vocabularyNavigators = theSearch.getVocabularyNavigators(); + final Map> vocabularyNavigators = theSearch.rankingProcess.getVocabularyNavigators(); if (vocabularyNavigators != null && !vocabularyNavigators.isEmpty()) { int navvoccount = 0; vocnav: for (Map.Entry> ve: vocabularyNavigators.entrySet()) { @@ -306,6 +308,7 @@ public class yacysearchtrailer { while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); count = ve.getValue().get(name); + if (count == 0) break; nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString(); queryStringForUrl = theSearch.query.queryStringForUrl(); p = queryStringForUrl.indexOf(nav); @@ -319,7 +322,7 @@ public class yacysearchtrailer { prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", "-" + nav); } prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name); - prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString(), theSearch.query.navigators).toString()); + prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theSearch.query, queryStringForUrl, theSearch.query.urlMask.toString()).toString()); prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count); prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1); i++; diff --git a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java 
index f3176eb10..0e806c9fa 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java @@ -215,6 +215,11 @@ public class OpensearchResponseWriter implements QueryResponseWriter { writer.write("\n".toCharArray()); } + /** + * produce snippets from solr (they call that 'highlighting') + * @param val + * @return a map from urlhashes to a list of snippets for that url + */ @SuppressWarnings("unchecked") public static Map> highlighting(final SimpleOrderedMap val) { Map> snippets = new HashMap>(); diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 224157e29..4ec094477 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -708,7 +708,7 @@ public class Domains { /** * resolve a host address using a local DNS cache and a DNS lookup if necessary - * @param host + * @param host0 * @return the hosts InetAddress or null if the address cannot be resolved */ public static InetAddress dnsResolve(final String host0) { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index d8470c00e..0063d382b 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -69,6 +69,7 @@ import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.opensearch.SRURSSConnector; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -77,6 +78,8 @@ import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.http.HTTPClient; +import 
net.yacy.cora.sorting.ClusteredScoreMap; +import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.ResultURLs; @@ -112,11 +115,14 @@ import net.yacy.utils.crypt; import org.apache.http.entity.mime.content.ContentBody; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.FacetField; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.FacetField.Count; import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.response.ResultContext; @@ -1022,6 +1028,8 @@ public final class Protocol } } + private final static YaCySchema[] snippetFields = new YaCySchema[]{YaCySchema.h1_txt, YaCySchema.h2_txt, YaCySchema.text_t}; + protected static int solrQuery( final SearchEvent event, final int offset, @@ -1038,6 +1046,24 @@ public final class Protocol final SolrQuery solrQuery = event.query.solrQuery(); solrQuery.setStart(offset); solrQuery.setRows(count); + + // set facet query attributes + if (event.query.facetfields.length > 0) { + solrQuery.setFacet(true); + solrQuery.setFacetLimit(event.query.maxfacets); + solrQuery.setFacetSort(FacetParams.FACET_SORT_COUNT); + for (String field: event.query.facetfields) solrQuery.addFacetField(field); + } + + // set highlighting query attributes + solrQuery.setHighlight(true); + solrQuery.setHighlightFragsize(SearchEvent.SNIPPET_MAX_LENGTH); + //solrQuery.setHighlightRequireFieldMatch(); + solrQuery.setHighlightSimplePost(""); + solrQuery.setHighlightSimplePre(""); + solrQuery.setHighlightSnippets(1); + for (YaCySchema field: snippetFields) 
solrQuery.addHighlightField(field.getSolrFieldName()); + boolean localsearch = target == null || target.equals(event.peers.mySeed()); if (localsearch) { // search the local index @@ -1064,6 +1090,34 @@ public final class Protocol } } + // evaluate facets + Map> facets = new HashMap>(event.query.facetfields.length); + for (String field: event.query.facetfields) { + FacetField facet = rsp.getFacetField(field); + ReversibleScoreMap result = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); + List values = facet == null ? null : facet.getValues(); + if (values == null) continue; + for (Count ff: values) result.set(ff.getName(), (int) ff.getCount()); + facets.put(field, result); + } + + // evaluate snippets + Map>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets + Map snippets = new HashMap(); // this will be a list of urlhash-snippet entries + nextsnippet: for (Map.Entry>> re: rawsnippets.entrySet()) { + Map> rs = re.getValue(); + for (YaCySchema field: snippetFields) { + if (rs.containsKey(field.getSolrFieldName())) { + List s = rs.get(field.getSolrFieldName()); + if (s.size() > 0) { + snippets.put(re.getKey(), s.get(0)); + continue nextsnippet; + } + } + } + // no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method + } + // evaluate result List container = new ArrayList(); if (docList.size() == 0) { @@ -1126,12 +1180,12 @@ public final class Protocol } if (localsearch) { - event.add(container, true, "localpeer", (int) docList.getNumFound()); + event.add(container, facets, snippets, true, "localpeer", (int) docList.getNumFound()); event.rankingProcess.addFinalize(); event.addExpectedRemoteReferences(-count); Network.log.logInfo("local search (solr): localpeer sent " + container.get(0).size() + "/" + docList.size() + " references"); } else { - event.add(container, false, target.getName() + "/" + target.hash, (int) 
docList.getNumFound()); + event.add(container, facets, snippets, false, target.getName() + "/" + target.hash, (int) docList.getNumFound()); event.rankingProcess.addFinalize(); event.addExpectedRemoteReferences(-count); Network.log.logInfo("remote search (solr): peer " + target.getName() + " sent " + container.get(0).size() + "/" + docList.size() + " references"); diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 182ed377e..b62b620f1 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -176,7 +176,7 @@ public class RemoteSearch extends Thread { QueryParams.hashSet2hashString(event.query.query_exclude_hashes), event.query.modifier, event.query.targetlang == null ? "" : event.query.targetlang, - event.query.sitehash == null ? "" : event.query.sitehash, + event.query.nav_sitehash == null ? "" : event.query.nav_sitehash, event.query.authorhash == null ? "" : event.query.authorhash, event.query.contentdom == null ? 
"all" : event.query.contentdom.toString(), count, diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 22bd3e256..079edffb8 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -318,14 +318,14 @@ public final class Fulltext implements Iterable { final String host = uri.getHost(); Thread t = new Thread(){ public void run() { - final BlockingQueue docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1); + final BlockingQueue docs = getSolr().concurrentQuery(YaCySchema.host_s.getSolrFieldName() + ":" + host, 0, 1000000, 600000, -1); try { SolrDocument doc; boolean removed = false; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); if (u.startsWith(path)) { - remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name()))); + remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); removed = true; } } @@ -805,7 +805,7 @@ public final class Fulltext implements Iterable { // delete in solr synchronized (Fulltext.this.solr) { try { - Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\""); + Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\""); Fulltext.this.solr.commit(); } catch (IOException e) {} } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f187dfd70..b6e981fe4 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -379,7 +379,7 @@ public class Segment { // STORE PAGE INDEX INTO WORD INDEX DB int outlinksSame = document.inboundLinks().size(); int outlinksOther = document.outboundLinks().size(); - final RankingProcess rankingProcess = (searchEvent == null) ? 
null : searchEvent.getRankingResult(); + final RankingProcess rankingProcess = (searchEvent == null) ? null : searchEvent.rankingProcess; final int urlLength = urlNormalform.length(); final int urlComps = MultiProtocolURI.urlComps(url.toString()).length; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 45737bb15..dccbb2d90 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -87,6 +87,14 @@ public final class QueryParams { } } + private static final String[] defaultfacetfields = new String[]{ + YaCySchema.host_s.getSolrFieldName(), + YaCySchema.url_protocol_s.getSolrFieldName(), + YaCySchema.url_file_ext_s.getSolrFieldName(), + YaCySchema.author.getSolrFieldName()}; + + private static final int defaultmaxfacets = 30; + private static final String ampersand = "&"; public static class Modifier { @@ -114,7 +122,6 @@ public final class QueryParams { public final Classification.ContentDomain contentdom; public final String targetlang; protected final Collection metatags; - public final String navigators; public final Searchdom domType; private final int zonecode; public final int maxDistance; @@ -123,8 +130,9 @@ public final class QueryParams { protected CacheStrategy snippetCacheStrategy; public final RankingProfile ranking; private final Segment indexSegment; - public final String host; // this is the client host that starts the query, not a site operator - public final String sitehash; // this is a domain hash, 6 bytes long or null + public final String clienthost; // this is the client host that starts the query, not a site operator + public final String nav_sitehost; // this is a domain name which is used to navigate to that host + public final String nav_sitehash; // this is a domain hash, 6 bytes long or null protected final Set siteexcludes; // set of domain hashes that are excluded if not included by sitehash public final String 
authorhash; public final Modifier modifier; @@ -138,6 +146,8 @@ public final class QueryParams { public final String userAgent; protected boolean filterfailurls; protected double lat, lon, radius; + public String[] facetfields; + public int maxfacets; // the following values are filled during the search process as statistics for the search public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index @@ -197,15 +207,15 @@ public final class QueryParams { this.constraint = constraint; this.allofconstraint = false; this.snippetCacheStrategy = null; - this.host = null; - this.sitehash = null; + this.clienthost = null; + this.nav_sitehash = null; + this.nav_sitehost = null; this.siteexcludes = null; this.authorhash = null; this.remotepeer = null; this.starttime = Long.valueOf(System.currentTimeMillis()); this.maxtime = 10000; this.timeout = this.starttime + this.timeout; - this.navigators = "all"; this.indexSegment = indexSegment; this.userAgent = userAgent; this.transmitcount = 0; @@ -221,6 +231,8 @@ public final class QueryParams { this.remote_available = new AtomicInteger(0); // the number of result contributions from all the remote peers this.remote_peerCount = new AtomicInteger(0); // the number of remote peers that have contributed this.misses = Collections.synchronizedSortedSet(new TreeSet(URIMetadataRow.rowdef.objectOrder)); + this.facetfields = defaultfacetfields; + this.maxfacets = defaultmaxfacets; } public QueryParams( @@ -235,12 +247,12 @@ public final class QueryParams { final int maxDistance, final String prefer, final ContentDomain contentdom, final String language, final Collection metatags, - final String navigators, final CacheStrategy snippetCacheStrategy, final int itemsPerPage, final int offset, final String urlMask, final Searchdom domType, final int domMaxTargets, final Bitfield constraint, final boolean allofconstraint, - final String site, + final String nav_sitehash, + final String 
nav_sitehost, final Set siteexcludes, final String authorhash, final int domainzone, @@ -280,16 +292,16 @@ public final class QueryParams { assert language != null; this.targetlang = language; this.metatags = metatags; - this.navigators = navigators; this.domType = domType; this.zonecode = domainzone; this.constraint = constraint; this.allofconstraint = allofconstraint; - this.sitehash = site; assert site == null || site.length() == 6; + this.nav_sitehash = nav_sitehash; assert nav_sitehash == null || nav_sitehash.length() == 6; + this.nav_sitehost = nav_sitehost; this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes; this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty(); this.snippetCacheStrategy = snippetCacheStrategy; - this.host = host; + this.clienthost = host; this.remotepeer = null; this.starttime = Long.valueOf(System.currentTimeMillis()); this.maxtime = 10000; @@ -311,6 +323,8 @@ public final class QueryParams { this.remote_available = new AtomicInteger(0); // the number of result contributions from all the remote peers this.remote_peerCount = new AtomicInteger(0); // the number of remote peers that have contributed this.misses = Collections.synchronizedSortedSet(new TreeSet(URIMetadataRow.rowdef.objectOrder)); + this.facetfields = defaultfacetfields; + this.maxfacets = defaultmaxfacets; } private double kmNormal = 100.d; // 100 =ca 40000.d / 360.d == 111.11 - if lat/lon is multiplied with this, rounded and diveded by this, the location is normalized to a 1km grid @@ -506,22 +520,30 @@ public final class QueryParams { final StringBuilder q = solrQueryString(this.query_include_words, this.query_exclude_words, this.indexSegment.fulltext().getSolrScheme()); // add constraints - if ( this.sitehash == null ) { + if (this.nav_sitehash == null && this.nav_sitehost == null) { if (this.siteexcludes != null) { for (String ex: this.siteexcludes) { - q.append(" 
-").append(YaCySchema.host_id_s.name()).append(':').append(ex); + q.append(" -").append(YaCySchema.host_id_s.getSolrFieldName()).append(':').append(ex); } } } else { - q.append(' ').append(YaCySchema.host_id_s.name()).append(':').append(this.sitehash); + if (this.nav_sitehost != null) + q.append(" AND ").append(YaCySchema.host_s.getSolrFieldName()).append(":\"").append(this.nav_sitehost).append('\"'); + else + q.append(" AND ").append(YaCySchema.host_id_s.getSolrFieldName()).append(":\"").append(this.nav_sitehash).append('\"'); } String urlMaskPattern = this.urlMask.pattern(); int extm = urlMaskPattern.indexOf(".*\\."); if (extm >= 0) { String ext = urlMaskPattern.substring(extm + 4); - q.append(" AND ").append(YaCySchema.url_file_ext_s.name()).append(':').append(ext); + q.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(':').append(ext); } - + extm = urlMaskPattern.indexOf("?://.*"); + if (extm >= 0) { + String protocol = urlMaskPattern.substring(0, extm); + q.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append(protocol); + } + // construct query final SolrQuery params = new SolrQuery(); params.setQuery(q.toString()); @@ -537,13 +559,13 @@ public final class QueryParams { //params.set("sfield", YaCySchema.coordinate_p.name()); //params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon)); //params.set("d", GeoLocation.degreeToKm(this.radius)); - params.setFilterQueries("{!bbox sfield=" + YaCySchema.coordinate_p.name() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}"); + params.setFilterQueries("{!bbox sfield=" + YaCySchema.coordinate_p.getSolrFieldName() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}"); //params.setRows(Integer.MAX_VALUE); } else { // set ranking if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) { // set a 
most-recent ordering - params.setSortField(YaCySchema.last_modified.name(), ORDER.desc); + params.setSortField(YaCySchema.last_modified.getSolrFieldName(), ORDER.desc); } } @@ -574,10 +596,10 @@ public final class QueryParams { wc = 0; Float boost; for (YaCySchema field: fields) { - if (configuration != null && !configuration.contains(field.name())) continue; + if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue; if (wc > 0) q.append(" OR "); q.append('('); - q.append(field.name()).append(':').append(w); + q.append(field.getSolrFieldName()).append(':').append(w); boost = boosts.get(field); if (boost != null) q.append('^').append(boost.toString()); q.append(')'); @@ -587,7 +609,7 @@ public final class QueryParams { q.append(')'); // add filter to prevent that results come from failed urls - q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]"); + q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]"); return q; } @@ -665,7 +687,7 @@ public final class QueryParams { context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk); context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk); context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk); - context.append(this.sitehash).append(asterisk); + context.append(this.nav_sitehash).append(asterisk); context.append(this.siteexcludes).append(asterisk); context.append(this.authorhash).append(asterisk); context.append(this.targetlang).append(asterisk); @@ -694,9 +716,9 @@ public final class QueryParams { */ public static StringBuilder navurl( final String ext, final int page, final QueryParams theQuery, - final String newQueryString, final String originalUrlMask, final String nav) { + final String newQueryString, final String originalUrlMask) { - final StringBuilder sb = navurlBase(ext, theQuery, newQueryString, 
originalUrlMask, nav); + final StringBuilder sb = navurlBase(ext, theQuery, newQueryString, originalUrlMask); sb.append(ampersand); sb.append("startRecord="); @@ -707,7 +729,7 @@ public final class QueryParams { public static StringBuilder navurlBase( final String ext, final QueryParams theQuery, - final String newQueryString, final String originalUrlMask, final String nav) { + final String newQueryString, final String originalUrlMask) { final StringBuilder sb = new StringBuilder(120); sb.append("/yacysearch."); @@ -727,10 +749,6 @@ public final class QueryParams { sb.append("verify="); sb.append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName()); - sb.append(ampersand); - sb.append("nav="); - sb.append(nav); - sb.append(ampersand); sb.append("urlmaskfilter="); sb.append(originalUrlMask); diff --git a/source/net/yacy/search/query/RankingProcess.java b/source/net/yacy/search/query/RankingProcess.java index 06a0b09f9..a8917c726 100644 --- a/source/net/yacy/search/query/RankingProcess.java +++ b/source/net/yacy/search/query/RankingProcess.java @@ -89,8 +89,9 @@ public final class RankingProcess extends Thread { protected final AtomicInteger receivedRemoteReferences; protected final ReferenceOrder order; protected final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) - protected final ScoreMap hostNavigator; // a counter for the appearance of the host hash - protected final Map hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash + protected final ScoreMap hostNavigator = new ConcurrentScoreMap(); // a counter for the appearance of host names + protected final ScoreMap hostHashNavigator; // a counter for the appearance of the host hash (this can be filled during classic remote search) + protected final Map hostHashResolver; // a mapping from a host hash (6 bytes) to the full url hash of one 
of these urls that have the host hash protected final Map taggingPredicates; // a map from tagging vocabulary names to tagging predicate uris protected final Map> vocabularyNavigator; // counters for Vocabularies; key is metatag.getVocabularyName() private boolean remote; @@ -117,8 +118,8 @@ public final class RankingProcess extends Thread { this.receivedRemoteReferences = new AtomicInteger(0); this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang)); this.urlhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); - this.hostNavigator = new ConcurrentScoreMap(); - this.hostResolver = new ConcurrentHashMap(); + this.hostHashNavigator = new ConcurrentScoreMap(); + this.hostHashResolver = new ConcurrentHashMap(); this.vocabularyNavigator = new ConcurrentHashMap>(); this.taggingPredicates = new HashMap(); for (Tagging t: LibraryProvider.autotagging.getVocabularies()) { @@ -284,7 +285,6 @@ public final class RankingProcess extends Thread { // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0; // apply all constraints long timeout = System.currentTimeMillis() + maxtime; @@ -336,18 +336,16 @@ public final class RankingProcess extends Thread { // check site constraints final String hosthash = iEntry.hosthash(); - if ( this.query.sitehash == null ) { + if ( this.query.nav_sitehash == null ) { if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) continue pollloop; } else { // filter out all domains that do not match with the site constraint - if (!hosthash.equals(this.query.sitehash)) continue pollloop; + if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop; } // collect host navigation information (even if we have only one; this is to provide a switch-off button) - if 
(this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) { - this.hostNavigator.inc(hosthash); - this.hostResolver.put(hosthash, iEntry.urlhash()); - } + this.hostHashNavigator.inc(hosthash); + this.hostHashResolver.put(hosthash, iEntry.urlhash()); // check protocol if (!this.query.urlMask_isCatchall) { @@ -420,31 +418,37 @@ public final class RankingProcess extends Thread { return this.localSearchInclusion; } - public ScoreMap getHostNavigator() { final ScoreMap result = new ConcurrentScoreMap(); - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts", 0) < 0 ) { - return result; - } - final Iterator domhashs = this.hostNavigator.keys(false); + final Iterator domhashs = this.hostHashNavigator.keys(false); URIMetadataNode row; byte[] urlhash; String hosthash, hostname; - if ( this.hostResolver != null ) { + if ( this.hostHashResolver != null ) { while ( domhashs.hasNext() && result.sizeSmaller(30) ) { hosthash = domhashs.next(); if ( hosthash == null ) { continue; } - urlhash = this.hostResolver.get(hosthash); + urlhash = this.hostHashResolver.get(hosthash); row = urlhash == null ? null : this.query.getSegment().fulltext().getMetadata(urlhash); hostname = row == null ? 
null : row.url().getHost(); if ( hostname != null ) { - result.set(hostname, this.hostNavigator.get(hosthash)); + result.set(hostname, this.hostHashNavigator.get(hosthash)); } } } + + // add only navigation hosts which have more than one entry + Iterator i = this.hostNavigator.keys(false); + while (i.hasNext()) { + String h = i.next(); + int c = this.hostNavigator.get(h); + if (c <= 0) break; + result.inc(h, c); + } + return result; } @@ -452,13 +456,10 @@ public final class RankingProcess extends Thread { return this.vocabularyNavigator; } - protected ScoreMap getTopicNavigator(final int count) { + public ScoreMap getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls final ScoreMap result = new ConcurrentScoreMap(); - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("topics", 0) < 0 ) { - return result; - } if ( this.ref.sizeSmaller(2) ) { this.ref.clear(); // navigators with one entry are not useful } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 7d6d6153d..19ee51173 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import com.hp.hpl.jena.rdf.model.RDFNode; @@ -41,6 +42,7 @@ import com.hp.hpl.jena.rdf.model.Resource; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.Distribution; import net.yacy.cora.lod.JenaTripleStore; @@ -48,8 +50,8 @@ import 
net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Scanner; -import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ConcurrentScoreMap; +import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element; @@ -105,10 +107,10 @@ public final class SearchEvent { private byte[] IAmaxcounthash, IAneardhthash; private final Thread localsearch; private final AtomicInteger expectedRemoteReferences, maxExpectedRemoteReferences; // counter for referenced that had been sorted out for other reasons - private final ScoreMap authorNavigator; // a counter for the appearances of authors - private final ScoreMap namespaceNavigator; // a counter for name spaces - private final ScoreMap protocolNavigator; // a counter for protocol types - private final ScoreMap filetypeNavigator; // a counter for file types + public final ScoreMap authorNavigator; // a counter for the appearances of authors + public final ScoreMap namespaceNavigator; // a counter for name spaces + public final ScoreMap protocolNavigator; // a counter for protocol types + public final ScoreMap filetypeNavigator; // a counter for file types protected final WeakPriorityBlockingQueue nodeStack; protected final WeakPriorityBlockingQueue result; protected final LoaderDispatcher loader; @@ -117,6 +119,7 @@ public final class SearchEvent { private SnippetWorker[] workerThreads; protected long urlRetrievalAllTime; protected long snippetComputationAllTime; + protected ConcurrentHashMap snippets; private final boolean remote; private boolean cleanupState; @@ -146,7 +149,8 @@ public final class SearchEvent { this.namespaceNavigator = new ConcurrentScoreMap(); this.protocolNavigator = new ConcurrentScoreMap(); this.filetypeNavigator = new ConcurrentScoreMap(); - + 
this.snippets = new ConcurrentHashMap(); + this.secondarySearchSuperviser = (this.query.query_include_hashes.size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches if ( this.secondarySearchSuperviser != null ) { @@ -393,53 +397,6 @@ public final class SearchEvent { return this.secondarySearchThreads; } - public RankingProcess getRankingResult() { - return this.rankingProcess; - } - - public ScoreMap getHostNavigator() { - return this.rankingProcess.getHostNavigator(); - } - - public ScoreMap getTopicNavigator(final int count) { - // returns a set of words that are computed as toplist - return this.rankingProcess.getTopicNavigator(count); - } - - public ScoreMap getNamespaceNavigator() { - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) { - return new ClusteredScoreMap(); - } - return this.namespaceNavigator; - } - - public ScoreMap getProtocolNavigator() { - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) { - return new ClusteredScoreMap(); - } - return this.protocolNavigator; - } - - public ScoreMap getFiletypeNavigator() { - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) { - return new ClusteredScoreMap(); - } - return this.filetypeNavigator; - } - - public ScoreMap getAuthorNavigator() { - // create a list of words that had been computed by statistics over all - // words that appeared in the url or the description of all urls - if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) { - return new ConcurrentScoreMap(); - } - return this.authorNavigator; - } - - public Map> getVocabularyNavigators() { - return this.rankingProcess.getVocabularyNavigators(); - } - public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) { synchronized ( this.heuristics ) { this.heuristics.put(urlhash, new 
HeuristicResult(urlhash, heuristicName, redundant)); @@ -452,7 +409,6 @@ public final class SearchEvent { } } - protected boolean workerAlive() { if ( this.workerThreads == null ) { return false; @@ -467,11 +423,14 @@ public final class SearchEvent { public void add( final List index, + final Map> facets, // a map from a field name to scored values + final Map solrsnippets, // a map from urlhash to snippet text final boolean local, final String resourceName, final int fullResource) { this.rankingProcess.addBegin(); + this.snippets.putAll(solrsnippets); assert (index != null); if (index.isEmpty()) return; @@ -494,8 +453,20 @@ public final class SearchEvent { // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0; + // collect navigation information + ReversibleScoreMap fcts = facets.get(YaCySchema.host_s.getSolrFieldName()); + if (fcts != null) this.rankingProcess.hostNavigator.inc(fcts); + + fcts = facets.get(YaCySchema.url_file_ext_s.getSolrFieldName()); + if (fcts != null) this.filetypeNavigator.inc(fcts); + + fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName()); + if (fcts != null) this.protocolNavigator.inc(fcts); + + //fcts = facets.get(YaCySchema.author.getSolrFieldName()); + //if (fcts != null) this.authorNavigator.inc(fcts); + // apply all constraints try { final String pattern = this.query.urlMask.pattern(); @@ -535,19 +506,13 @@ public final class SearchEvent { // check site constraints final String hosthash = iEntry.hosthash(); - if ( this.query.sitehash == null ) { + if ( this.query.nav_sitehash == null ) { if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) { continue pollloop; } } else { // filter out all domains that do not match with the site constraint - if (!hosthash.equals(this.query.sitehash)) continue pollloop; - } - - // 
collect host navigation information (even if we have only one; this is to provide a switch-off button) - if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) { - this.rankingProcess.hostNavigator.inc(hosthash); - this.rankingProcess.hostResolver.put(hosthash, iEntry.hash()); + if (!hosthash.equals(this.query.nav_sitehash)) continue pollloop; } // check protocol @@ -874,13 +839,6 @@ public final class SearchEvent { // from here: collect navigation information - // collect host navigation information (even if we have only one; this is to provide a switch-off button) - if (!this.query.navigators.isEmpty() && (this.query.urlMask_isCatchall || this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0)) { - final String hosthash = page.hosthash(); - this.rankingProcess.hostNavigator.inc(hosthash); - this.rankingProcess.hostResolver.put(hosthash, page.hash()); - } - // namespace navigation String pagepath = page.url().getPath(); if ( (p = pagepath.indexOf(':')) >= 0 ) { @@ -896,10 +854,6 @@ public final class SearchEvent { final String protocol = page.url().getProtocol(); this.protocolNavigator.inc(protocol); - // file type navigation - final String fileext = page.url().getFileExtension(); - if ( fileext.length() > 0 ) this.filetypeNavigator.inc(fileext); - return page; // accept url } Log.logWarning("RWIProcess", "loop terminated"); diff --git a/source/net/yacy/search/query/SnippetWorker.java b/source/net/yacy/search/query/SnippetWorker.java index 2b9b9e5e0..3cfde7b74 100644 --- a/source/net/yacy/search/query/SnippetWorker.java +++ b/source/net/yacy/search/query/SnippetWorker.java @@ -22,9 +22,11 @@ package net.yacy.search.query; import java.util.Iterator; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; +import net.yacy.cora.sorting.ConcurrentScoreMap; import 
net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.cora.storage.HandleSet; @@ -36,6 +38,7 @@ import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.index.Segment; import net.yacy.search.snippet.ResultEntry; import net.yacy.search.snippet.TextSnippet; +import net.yacy.search.snippet.TextSnippet.ResultClass; public class SnippetWorker extends Thread { private final SearchEvent snippetProcess; @@ -60,8 +63,6 @@ public class SnippetWorker extends Thread { // start fetching urls and snippets URIMetadataNode page; ResultEntry resultEntry; - //final int fetchAhead = snippetMode == 0 ? 0 : 10; - final boolean nav_topics = this.snippetProcess.query.navigators.equals("all") || this.snippetProcess.query.navigators.indexOf("topics",0) >= 0; try { while (this.shallrun && System.currentTimeMillis() < this.timeout) { this.lastLifeSign = System.currentTimeMillis(); @@ -97,27 +98,21 @@ public class SnippetWorker extends Thread { continue; } - // in case that we have an attached solr, we load also the solr document - String solrContent = page.getText(); - - resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 + resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 if (resultEntry == null) { continue; // the entry had some problems, cannot be used } //if (result.contains(resultEntry)) continue; - this.snippetProcess.urlRetrievalAllTime += resultEntry.dbRetrievalTime; this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime; // place the result to the result vector // apply post-ranking long ranking = resultEntry.word() == null ? 
0 : Long.valueOf(this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word())); - ranking += postRanking(resultEntry, this.snippetProcess.rankingProcess.getTopicNavigator(10)); + ranking += postRanking(resultEntry, new ConcurrentScoreMap() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/); resultEntry.ranking = ranking; this.snippetProcess.result.put(new ReverseElement(resultEntry, ranking)); // remove smallest in case of overflow - if (nav_topics) { - this.snippetProcess.rankingProcess.addTopics(resultEntry); - } + this.snippetProcess.rankingProcess.addTopics(resultEntry); } if (System.currentTimeMillis() >= this.timeout) { Log.logWarning("SnippetProcess", "worker ended with timeout"); @@ -199,7 +194,7 @@ public class SnippetWorker extends Thread { return r; } - private ResultEntry fetchSnippet(final URIMetadataNode page, final String solrText, final CacheStrategy cacheStrategy) { + private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) { // Snippet Fetching can has 3 modes: // 0 - do not fetch snippets // 1 - fetch snippets offline only @@ -208,16 +203,15 @@ public class SnippetWorker extends Thread { // load only urls if there was not yet a root url of that hash // find the url entry - long startTime = System.currentTimeMillis(); - if (page == null) { - return null; + String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash())); + if (solrsnippet != null && solrsnippet.length() > 0) { + final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, ""); + return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); } - final long dbRetrievalTime = System.currentTimeMillis() - startTime; - + if (cacheStrategy == null) { final TextSnippet snippet = new TextSnippet( null, - solrText, page, this.snippetProcess.snippetFetchWordHashes, //this.query.queryString, @@ -225,16 +219,15 @@ public class 
SnippetWorker extends Thread { ((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))), SearchEvent.SNIPPET_MAX_LENGTH, !this.snippetProcess.query.isLocal()); - return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, 0); // result without snippet + return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet } // load snippet if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) { // attach text snippet - startTime = System.currentTimeMillis(); + long startTime = System.currentTimeMillis(); final TextSnippet snippet = new TextSnippet( this.snippetProcess.loader, - solrText, page, this.snippetProcess.snippetFetchWordHashes, cacheStrategy, @@ -246,16 +239,16 @@ public class SnippetWorker extends Thread { if (!snippet.getErrorCode().fail()) { // we loaded the file and found the snippet - return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached + return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached } else if (cacheStrategy.mustBeOffline()) { // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result // this may happen during a remote search, because snippet loading is omitted to retrieve results faster - return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet + return new ResultEntry(page, this.snippetProcess.query.getSegment(), 
this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet } else { // problems with snippet fetch if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) { // we accept that because the word cannot be on the page - return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0); + return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); } final String reason = "no text snippet; errorCode = " + snippet.getErrorCode(); if (this.snippetProcess.deleteIfSnippetFail) { @@ -265,6 +258,6 @@ public class SnippetWorker extends Thread { return null; } } - return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0); // result without snippet + return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet } } diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index b38be9827..2388dc095 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -60,21 +60,20 @@ public class ResultEntry implements Comparable, Comparator mediaSnippets, - final long dbRetrievalTime, final long snippetComputationTime) { + final long snippetComputationTime) { this.urlentry = urlentry; this.indexSegment = indexSegment; this.alternative_urlstring = null; this.alternative_urlname = null; this.textSnippet = textSnippet; this.mediaSnippets = mediaSnippets; - this.dbRetrievalTime = dbRetrievalTime; this.snippetComputationTime = snippetComputationTime; final String host = urlentry.url().getHost(); if (host != null && host.endsWith(".yacyh")) { diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 5e119f06b..8c1899941 
100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -132,20 +132,21 @@ public class TextSnippet implements Comparable, Comparator, Comparator, Comparator, Comparator, Comparator, Comparator, Comparator 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null); + init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null); return; } @@ -262,12 +265,12 @@ public class TextSnippet implements Comparable, Comparator, Comparator, Comparator, Comparator, Comparator" + textline; if (snippetLine == null || !remainingHashes.isEmpty()) { - init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); + init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); return; } if (snippetLine.length() > snippetMaxLength) snippetLine = snippetLine.substring(0, snippetMaxLength); @@ -333,16 +336,18 @@ public class TextSnippet implements Comparable, Comparator, Comparator