// QueryParams.java // ----------------------- // part of YACY // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2005 // Created: 10.10.2005 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.search.query; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.commons.lang.StringUtils; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.RegExp; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.common.params.DisMaxParams; import org.apache.solr.common.params.FacetParams; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.LibraryProvider; import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.SetTools; import net.yacy.peers.Seed; import net.yacy.search.index.Segment; import net.yacy.search.ranking.RankingProfile; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; public final class QueryParams { /** The default max count of item lines in navigator */ public static final int FACETS_STANDARD_MAXCOUNT_DEFAULT = 100; /** The default maximum number of date elements in the date navigator */ public static final int FACETS_DATE_MAXCOUNT_DEFAULT = 640; public enum Searchdom { LOCAL, CLUSTER, GLOBAL; @Override public String toString() { if (this == LOCAL) return "local"; 
else if (this == CLUSTER) return "global"; // yes thats right: global, not cluster because a cluster search is a global search else if (this == GLOBAL) return "global"; return "local"; } } private static final Map defaultfacetfields = new HashMap(); static { // the key shall match with configuration property search.navigation // defaultfacetfields.put("location", CollectionSchema.coordinate_p_0_coordinate); // coordinate_p can't be used for facet (subfields), as value isn't used subfield can be used defaultfacetfields.put("hosts", CollectionSchema.host_s); defaultfacetfields.put("protocol", CollectionSchema.url_protocol_s); defaultfacetfields.put("filetype", CollectionSchema.url_file_ext_s); defaultfacetfields.put("date", CollectionSchema.dates_in_content_dts); defaultfacetfields.put("authors", CollectionSchema.author_sxt); defaultfacetfields.put("collections", CollectionSchema.collection_sxt); defaultfacetfields.put("language", CollectionSchema.language_s); //missing: namespace } public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA"); public static final Pattern catchall_pattern = Pattern.compile(".*"); private final QueryGoal queryGoal; public int itemsPerPage; public int offset; /** The URL mask pattern compiled from the urlMasString. * Null when the urlMaskString is not user provided but generated from the query modifiers */ public Pattern urlMaskPattern; public Automaton urlMaskAutomaton; public String urlMaskString; public final Pattern prefer; public final String tld, inlink; /** true when the urlMasString is just a catch all pattern such as ".*" */ boolean urlMask_isCatchall; /** Content-Type classification of expected results */ public final Classification.ContentDomain contentdom; /** *

 * When false, results can be extended to documents including links to documents
 * of {@link #contentdom} type, without being themselves of that type.

* Examples : * */ private boolean strictContentDom = false; public final String targetlang; protected final Collection metatags; public final Searchdom domType; private final int zonecode; public final int maxDistance; public final Bitfield constraint; public final boolean allofconstraint; protected CacheStrategy snippetCacheStrategy; public final RankingProfile ranking; private final Segment indexSegment; public final String clienthost; // this is the client host that starts the query, not a site operator protected final Set siteexcludes; // set of domain hashes that are excluded if not included by sitehash public final QueryModifier modifier; public Seed remotepeer; public final long starttime; // the time when the query started, how long it should take and the time when the timeout is reached (milliseconds) protected final long maxtime; // values that are set after a search: public int transmitcount; // number of results that had been shown to the user public long searchtime, urlretrievaltime, snippetcomputationtime; // time to perform the search, to get all the urls, and to compute the snippets public final String userAgent; protected double lat, lon, radius; public LinkedHashSet facetfields; private SolrQuery cachedQuery; private CollectionConfiguration solrSchema; public final int timezoneOffset; /** The max count of item lines in navigator */ private int standardFacetsMaxCount; /** The maximum number of date elements in the date navigator */ private int dateFacetMaxCount; public QueryParams( final QueryGoal queryGoal, final QueryModifier modifier, final int maxDistance, final String prefer, final ContentDomain contentdom, final String language, final int timezoneOffset, final Collection metatags, final CacheStrategy snippetCacheStrategy, final int itemsPerPage, final int offset, final String urlMask, final String tld, final String inlink, final Searchdom domType, final Bitfield constraint, final boolean allofconstraint, final Set siteexcludes, final int domainzone, final String host, final boolean specialRights, final Segment indexSegment, final RankingProfile ranking, final String userAgent, final double lat, final double lon, final double radius, final String[] search_navigation ) { this.queryGoal = queryGoal; this.modifier = modifier; this.ranking = ranking; this.maxDistance = maxDistance; this.contentdom = contentdom; this.timezoneOffset = timezoneOffset; this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage); if(domType == Searchdom.LOCAL) { /* No offset restriction on local index only requests, as only itemsPerPage will be loaded */ this.offset = Math.max(0, offset); } else { /* Offset has to be limited on requests mixing local and remote results, because all results before offset are loaded */ this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset)); } try { this.urlMaskString = urlMask; // solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?"; int p; while ((p = this.urlMaskString.indexOf(':')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1); while ((p = this.urlMaskString.indexOf('/')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1); while ((p = this.urlMaskString.indexOf('\\')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." 
+ this.urlMaskString.substring(p + 2); this.urlMaskAutomaton = Automata.makeString(this.urlMaskString); this.urlMaskPattern = Pattern.compile(this.urlMaskString); } catch (final Throwable ex) { throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex); } this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString()); if (this.urlMask_isCatchall) { final String filter = QueryParams.buildApproximateURLFilter(modifier, tld); if (!QueryParams.catchall_pattern.toString().equals(filter)) { this.urlMaskString = filter; this.urlMaskAutomaton = Automata.makeString(filter); this.urlMask_isCatchall = false; /* We let here the urlMaskPattern null : * final URL match checking will be made with the more accurate matchesURL function */ this.urlMaskPattern = null; } } this.tld = tld; this.inlink = inlink; try { this.prefer = Pattern.compile(prefer); } catch (final PatternSyntaxException ex) { throw new IllegalArgumentException("Not a valid regular expression: " + prefer, ex); } assert language != null; this.targetlang = language; this.metatags = metatags; this.domType = domType; this.zonecode = domainzone; this.constraint = constraint; this.allofconstraint = allofconstraint; this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes; this.snippetCacheStrategy = snippetCacheStrategy; this.clienthost = host; this.remotepeer = null; this.starttime = Long.valueOf(System.currentTimeMillis()); this.maxtime = 10000; this.indexSegment = indexSegment; this.userAgent = userAgent; this.transmitcount = 0; // we normalize here the location and radius because that should cause a better caching // and as surplus it will increase privacy this.lat = Math.floor(lat * this.kmNormal) / this.kmNormal; this.lon = Math.floor(lon * this.kmNormal) / this.kmNormal; this.radius = Math.floor(radius * this.kmNormal + 1) / this.kmNormal; this.facetfields = new LinkedHashSet(); this.solrSchema = indexSegment.fulltext().getDefaultConfiguration(); for (String navkey: search_navigation) { CollectionSchema f = defaultfacetfields.get(navkey); // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) // dto. for coordinate_p_0_coordinate is not enabled but used for location facet (because coordinate_p not valid for facet field) if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt") || f.name().equals("coordinate_p_0_coordinate") )) this.facetfields.add(f.getSolrFieldName()); } if (LibraryProvider.autotagging != null) for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { if (v.isFacet()) { this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); } } for (String context: ProbabilisticClassifier.getContextNames()) { this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX); } this.cachedQuery = null; this.standardFacetsMaxCount = FACETS_STANDARD_MAXCOUNT_DEFAULT; this.dateFacetMaxCount = FACETS_DATE_MAXCOUNT_DEFAULT; } /** * Generate an URL filter from the query modifier and eventual tld, usable as a * first approximation for filtering, and compatible with the yacy/search * API.
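 * For example (illustration only, not from the original sources): a modifier carrying only the
 * sitehost constraint "example.org" yields a filter that accepts URLs on "example.org" or
 * "www.example.org" under any protocol, while a call without any constraint at all simply
 * returns the catch-all pattern ".*".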
* For truly accurate filtering, checking constraints against parsed URLs in * MultiprotocolURL instances is easier and more reliable than building a complex regular * expression that must be both compatible with the JDK {@link Pattern} and with Lucene {@link RegExp}. * * @param modifier * query modifier with eventual protocol, sitehost and filetype * constraints. The modifier parameter itselft must not be null. * @param tld * an eventual Top Level Domain name * @return an URL filter regular expression from the provided modifier and tld * constraints, matching anything when there are no constraints at all. */ protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) { final String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol; final String defaulthostprefix = "www"; final String hostfilter; if(modifier.sitehost == null && tld == null) { hostfilter = ".*"; } else if(modifier.sitehost == null) { hostfilter = ".*\\." + tld; } else if(modifier.sitehost.startsWith(defaulthostprefix + ".")){ hostfilter = "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4); } else { hostfilter = "(" + defaulthostprefix + "\\.)?" + modifier.sitehost; } final String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; // TODO: should be ".ext" but while/comment above suggests not -> add filetype contrain pullOneFilteredFromRWI() String filter = protocolfilter + "..." + hostfilter + "." + filefilter; if (!filter.equals(".*....*..*")) { /* Remove redundant sequences of catch all expressions */ Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*"); Matcher m; while ((m = r.matcher(filter)).find()) { filter = m.replaceAll(".*"); } } else { filter = QueryParams.catchall_pattern.toString(); } return filter; } private double kmNormal = 100.d; // 100 =ca 40000.d / 360.d == 111.11 - if lat/lon is multiplied with this, rounded and diveded by this, the location is normalized to a 1km grid public Segment getSegment() { return this.indexSegment; } public int neededResults() { // the number of result lines that must be computed return this.offset + this.itemsPerPage; } public int itemsPerPage() { // the number of result lines that are displayed at once (size of result page) return this.itemsPerPage; } public void setOffset(final int newOffset) { this.offset = newOffset; } public boolean isLocal() { return this.domType == Searchdom.LOCAL; } /** * @return the max count of item lines in standard navigators */ public int getStandardFacetsMaxCount() { return this.standardFacetsMaxCount; } /** * @param standardFacetsMaxCount the max count of item lines in standard navigators */ public void setStandardFacetsMaxCount(final int standardFacetsMaxCount) { this.standardFacetsMaxCount = standardFacetsMaxCount; } /** * @return the maximum number of date elements in the date navigator */ public int getDateFacetMaxCount() { return this.dateFacetMaxCount; } /** * @param dateFacetMaxCount the maximum number of date elements in the date navigator */ public void setDateFacetMaxCount(final int dateFacetMaxCount) { this.dateFacetMaxCount = dateFacetMaxCount; } /** * @return false when results can be extended to documents including links to documents ot contentdom type. */ public boolean isStrictContentDom() { return this.strictContentDom; } /** * @param strictContentDom when false, results can be extended to documents including links to documents ot contentdom type. 
*/ public void setStrictContentDom(final boolean strictContentDom) { this.strictContentDom = strictContentDom; } public static HandleSet hashes2Set(final String query) { final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); if (query != null) { for (int i = 0; i < (query.length() / Word.commonHashLength); i++) try { keyhashes.put(ASCII.getBytes(query.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength))); } catch (final SpaceExceededException e) { ConcurrentLog.logException(e); } } return keyhashes; } public static String hashSet2hashString(final HandleSet hashes) { final byte[] bb = new byte[hashes.size() * Word.commonHashLength]; int p = 0; for (final byte[] b : hashes) { assert b.length == Word.commonHashLength : "hash = " + ASCII.String(b); System.arraycopy(b, 0, bb, p, Word.commonHashLength); p += Word.commonHashLength; } return ASCII.String(bb); } public static String hashSet2hashString(final Set hashes) { final byte[] bb = new byte[hashes.size() * Word.commonHashLength]; int p = 0; for (final String s : hashes) { assert s.length() == Word.commonHashLength : "hash = " + s; System.arraycopy(ASCII.getBytes(s), 0, bb, p, Word.commonHashLength); p += Word.commonHashLength; } return ASCII.String(bb); } public static String anonymizedQueryHashes(final HandleSet hashes) { // create a more anonymized representation of a query hashes for logging final Iterator i = hashes.iterator(); final StringBuilder sb = new StringBuilder(hashes.size() * (Word.commonHashLength + 2) + 2); sb.append("["); byte[] hash; if (i.hasNext()) { hash = i.next(); sb.append(ASCII.String(hash).substring(0, 3)).append("........."); } while (i.hasNext()) { hash = i.next(); sb.append(", ").append(ASCII.String(hash).substring(0, 3)).append("........."); } sb.append("]"); return sb.toString(); } /** * Check wheter the given URL matches the eventual modifier and top-level domain * constraints. Should be preferred as more accurate than the url mask pattern generated with * {@link #buildApproximateURLFilter(QueryModifier, String)}. 
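 * For example (illustration only): a URL on host "docs.example.org" checked against a modifier
 * with sitehost "example.org" yields "sitehost", while a URL satisfying all constraints yields
 * the empty string.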
* * @param modifier * the query modifier with eventual constraints on protocoln, host * name or file extension * @param tld * an eventual top-level domain name to filter on * @param url * the url to check * @return the constraint that did not match ("url" when url is null, * "protocol", "sitehost", "tld", or "filetype"), or the empty string * when the url matches */ public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) { if (url == null) { return "url"; } if (modifier != null) { if (modifier.protocol != null) { if (!modifier.protocol.equalsIgnoreCase(url.getProtocol())) { return "protocol"; } } if (modifier.sitehost != null) { /* * consider to search for hosts with 'www'-prefix, if not already part of the * host name */ final String wwwPrefix = "www."; final String host; final String hostWithWwwPrefix; if (modifier.sitehost.startsWith(wwwPrefix)) { hostWithWwwPrefix = modifier.sitehost; host = modifier.sitehost.substring(wwwPrefix.length()); } else { hostWithWwwPrefix = wwwPrefix + modifier.sitehost; host = modifier.sitehost; } if (!host.equalsIgnoreCase(url.getHost()) && !hostWithWwwPrefix.equals(url.getHost())) { return "sitehost"; } } if (tld != null) { if (!tld.equalsIgnoreCase(url.getTLD())) { return "tld"; } } if (modifier.filetype != null) { if (!modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) { return "filetype"; } } } return ""; } /** * check if the given text matches with the query * this checks inclusion and exclusion words * @param text * @return true if the query matches with the given text */ private final boolean matchesText(final String text) { boolean ret = false; QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Tokenizer.getWords(text, null).keySet()); if (!SetTools.anymatchByTest(this.queryGoal.getExcludeWords(), words)) { ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words); } return ret; } protected static final boolean anymatch(final String text, final Iterator keywords) { if (keywords == null || !keywords.hasNext()) return false; final SortedSet textwords = (SortedSet) Tokenizer.getWords(text, null).keySet(); return SetTools.anymatchByTest(keywords, textwords); } public SolrQuery solrQuery(final ContentDomain cd, final boolean strictContentDom, final boolean getFacets, final boolean excludeintext_image) { if (cd == ContentDomain.IMAGE) { return solrImageQuery(getFacets, strictContentDom); } final List filterQueries; switch (cd) { case AUDIO: filterQueries = this.queryGoal.collectionAudioFilterQuery(strictContentDom); break; case VIDEO: filterQueries = this.queryGoal.collectionVideoFilterQuery(strictContentDom); break; case APP: filterQueries = this.queryGoal.collectionApplicationFilterQuery(strictContentDom); break; default: filterQueries = this.queryGoal.collectionTextFilterQuery(excludeintext_image); break; } return solrQuery(getFacets, filterQueries); } /** * @param getFacets when true, generate facets for fiels given in this.facetfields * @param filterQueries a mutable list of filter queries, initialized with filters related to content domain. Must not be null. 
* @return a Solr query instance ready to use */ private SolrQuery solrQuery(final boolean getFacets, final List filterQueries) { if (this.cachedQuery != null) { this.cachedQuery.setStart(this.offset); if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } // construct query final SolrQuery params = getBasicParams(getFacets, filterQueries); int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0; params.setQuery(this.queryGoal.collectionTextQuery().toString()); Ranking actRanking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile String fq = actRanking.getFilterQuery(); String bq = actRanking.getBoostQuery(); String bf = actRanking.getBoostFunction(); final String qf = actRanking.getQueryFields(); if (!qf.isEmpty()) params.setParam(DisMaxParams.QF, qf); if (this.queryGoal.getIncludeSize() > 1) { // add boost on combined words if (bq.length() > 0) bq += "\n"; bq += CollectionSchema.text_t.getSolrFieldName() + ":\"" + this.queryGoal.getIncludeString() + "\"^10"; } if (fq.length() > 0) { String[] oldfq = params.getFilterQueries(); ArrayList newfq = new ArrayList<>(oldfq.length + 1); for (String x: oldfq) newfq.add(x); newfq.add(fq); params.setFilterQueries(newfq.toArray(new String[newfq.size()])); } if (bq.length() > 0) params.setParam(DisMaxParams.BQ, bq.split("[\\r\\n]+")); // split on any sequence consisting of CR and/or LF if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29 // prepare result ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString()); this.cachedQuery = params; return params; } private SolrQuery solrImageQuery(final boolean getFacets, final boolean strictContentDom) { if (this.cachedQuery != null) { this.cachedQuery.setStart(this.offset); if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } // construct query final SolrQuery params = getBasicParams(getFacets, this.queryGoal.collectionImageFilterQuery(strictContentDom)); params.setQuery(this.queryGoal.collectionImageQuery(this.modifier).toString()); if(!strictContentDom) { // set boosts StringBuilder bq = new StringBuilder(); bq.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\""); bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\""); bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\""); bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\""); params.setParam(DisMaxParams.BQ, bq.toString()); } // prepare result ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString()); this.cachedQuery = params; return params; } private SolrQuery getBasicParams(final boolean getFacets, final List fqs) { final SolrQuery params = new SolrQuery(); params.setParam("defType", "edismax"); params.setParam(DisMaxParams.QF, CollectionSchema.text_t.getSolrFieldName() + "^1.0"); params.setStart(this.offset); params.setRows(this.itemsPerPage); params.setFacet(false); if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) { // set a most-recent ordering params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc)); //params.setSortField(CollectionSchema.last_modified.getSolrFieldName(), ORDER.desc); // 
// deprecated in Solr 4.2
        }

        // add site facets
        fqs.addAll(getFacetsFilterQueries());
        if (fqs.size() > 0) {
            params.setFilterQueries(fqs.toArray(new String[fqs.size()]));
        }

        // set facet query attributes
        if (getFacets && this.facetfields.size() > 0) {
            params.setFacet(true);
            params.setFacetMinCount(1);
            params.setFacetLimit(this.standardFacetsMaxCount);
            params.setFacetSort(FacetParams.FACET_SORT_COUNT);
            params.setParam(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_enum); // fight the fieldcache
            for (String field: this.facetfields) params.addFacetField("{!ex=" + field + "}" + field);
            // params.addFacetField("{!ex=" + field + "}" + field);
            if (this.facetfields.contains(CollectionSchema.dates_in_content_dts.name())) {
                params.setParam(FacetParams.FACET_RANGE, CollectionSchema.dates_in_content_dts.name());
                String start = new Date(System.currentTimeMillis() - 1000L * 60L * 60L * 24L * 3).toInstant().toString();
                String end = new Date(System.currentTimeMillis() + 1000L * 60L * 60L * 24L * 3).toInstant().toString();
                params.setParam("f." + CollectionSchema.dates_in_content_dts.getSolrFieldName() + ".facet.range.start", start);
                params.setParam("f." + CollectionSchema.dates_in_content_dts.getSolrFieldName() + ".facet.range.end", end);
                params.setParam("f." + CollectionSchema.dates_in_content_dts.getSolrFieldName() + ".facet.range.gap", "+1DAY");
                params.setParam("f." + CollectionSchema.dates_in_content_dts.getSolrFieldName() + ".facet.sort", "index");
                params.setParam("f." + CollectionSchema.dates_in_content_dts.getSolrFieldName() + ".facet.limit", Integer.toString(this.dateFacetMaxCount)); // the year constraint should cause that limitation already
            }
            //for (String k: params.getParameterNames()) {ArrayList al = new ArrayList<>(); for (String s: params.getParams(k)) al.add(s); System.out.println("Parameter: " + k + "=" + al.toString());}
            //http://localhost:8090/solr/collection1/select?q=*:*&rows=0&facet=true&facet.field=dates_in_content_dts&f.dates_in_content_dts.facet.limit=730&f.dates_in_content_dts.facet.sort=index
        } else {
            params.setFacet(false);
        }
        params.setFields("*", "score"); // we need the score for post-ranking
        return params;
    }

    long year = 1000L * 60L * 60L * 24L * 365L;

    private List<String> getFacetsFilterQueries() {
        ArrayList<String> fqs = new ArrayList<>();

        // add site facets
        if (this.modifier.sitehash == null && this.modifier.sitehost == null) {
            if (this.siteexcludes != null) {
                for (String ex: this.siteexcludes) {
                    fqs.add("-" + CollectionSchema.host_id_s.getSolrFieldName() + ':' + ex);
                }
            }
        } else {
            if (this.modifier.sitehost != null) {
                // consider to search for hosts with 'www'-prefix, if not already part of the host name
                if (this.modifier.sitehost.startsWith("www.")) {
                    fqs.add(CollectionSchema.host_s.getSolrFieldName() + ":\"" + this.modifier.sitehost.substring(4) + "\" OR " + CollectionSchema.host_s.getSolrFieldName() + ":\"" + this.modifier.sitehost + "\"");
                } else {
                    fqs.add(CollectionSchema.host_s.getSolrFieldName() + ":\"" + this.modifier.sitehost + "\" OR " + CollectionSchema.host_s.getSolrFieldName() + ":\"www."
+ this.modifier.sitehost + "\""); } } else fqs.add(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + this.modifier.sitehash + '\"'); } // add vocabulary facets if (this.metatags != null) { for (Tagging.Metatag tag : this.metatags) { fqs.add(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX + ":\"" + tag.getObject() + '\"'); } } // add language facet if (this.modifier.language != null && this.modifier.language.length() > 0 && this.solrSchema.contains((CollectionSchema.language_s))) { fqs.add(CollectionSchema.language_s.getSolrFieldName() + ":\"" + this.modifier.language + '\"'); } // add author facets (check for contains(author) as author_sxt is omitted copyfield) if (this.modifier.author != null && this.modifier.author.length() > 0 && this.solrSchema.contains(CollectionSchema.author)) { fqs.add(CollectionSchema.author_sxt.getSolrFieldName() + ":\"" + this.modifier.author + '\"'); } // add keyword filter if (this.modifier.keyword != null && this.modifier.keyword.length() > 0 && this.solrSchema.contains(CollectionSchema.keywords)) { fqs.add(CollectionSchema.keywords.getSolrFieldName() + ":\"" + this.modifier.keyword + '\"'); } // add collection facets if (this.modifier.collection != null && this.modifier.collection.length() > 0 && this.solrSchema.contains(CollectionSchema.collection_sxt)) { fqs.add(QueryModifier.parseCollectionExpression(this.modifier.collection)); } if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) { if (this.modifier.on != null && this.modifier.on.length() > 0) { fqs.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset)); } if (this.modifier.from != null && this.modifier.from.length() > 0 && (this.modifier.to == null || this.modifier.to.equals("*"))) { fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset)); } if ((this.modifier.from == null || this.modifier.from.equals("*")) && this.modifier.to != null && this.modifier.to.length() > 0) { fqs.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset)); } if (this.modifier.from != null && this.modifier.from.length() > 0 && this.modifier.to != null && this.modifier.to.length() > 0) { fqs.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to, this.timezoneOffset)); } } if (this.modifier.protocol != null) { fqs.add("{!tag=" + CollectionSchema.url_protocol_s.getSolrFieldName() + "}" + CollectionSchema.url_protocol_s.getSolrFieldName() + ':' + this.modifier.protocol); } if (this.tld != null) { /* Use the host_s field which is mandatory, rather than the optional host_dnc_s field */ fqs.add(CollectionSchema.host_s.getSolrFieldName() + ":*." 
                    + this.tld);
        }
        if (this.modifier.filetype != null) {
            fqs.add(CollectionSchema.url_file_ext_s.getSolrFieldName() + ":\"" + this.modifier.filetype + '\"');
        }
        if (this.inlink != null) {
            fqs.add(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() + ":\"" + this.inlink + '\"');
        }
        if (!this.urlMask_isCatchall && this.urlMaskPattern != null) {
            // add a filter query on urls only if user-provided and not generated from other modifiers
            fqs.add(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
        }
        if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
            // location search, no special ranking
            // try http://localhost:8090/solr/select?q=*:*&fq={!bbox sfield=coordinate_p pt=50.17,8.65 d=1}
            //params.setQuery("!bbox " + q.toString());
            //params.set("sfield", YaCySchema.coordinate_p.name());
            //params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon));
            //params.set("d", GeoLocation.degreeToKm(this.radius));
            fqs.add("{!bbox sfield=" + CollectionSchema.coordinate_p.getSolrFieldName() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}");
            //params.setRows(Integer.MAX_VALUE);
        }
        return fqs;
    }

    public QueryGoal getQueryGoal() {
        return this.queryGoal;
    }

    public final Map<AnchorURL, String> separateMatches(final Map<AnchorURL, String> links) {
        final Map<AnchorURL, String> matcher = new HashMap<>();
        final Iterator<Map.Entry<AnchorURL, String>> i = links.entrySet().iterator();
        Map.Entry<AnchorURL, String> entry;
        AnchorURL url;
        String anchorText;
        while (i.hasNext()) {
            entry = i.next();
            url = entry.getKey();
            anchorText = entry.getValue();
            if (matchesText(anchorText)) {
                matcher.put(url, anchorText);
                i.remove();
            }
        }
        return matcher;
    }

    private volatile String idCacheAnon = null, idCache = null;
    final static private char asterisk = '*';

    public String id(final boolean anonymized) {
        if (anonymized) {
            if (this.idCacheAnon != null) return this.idCacheAnon;
        } else {
            if (this.idCache != null) return this.idCache;
        }
        synchronized (this) {
            // do a Double-Checked Locking
            if (anonymized) {
                if (this.idCacheAnon != null) return this.idCacheAnon;
            } else {
                if (this.idCache != null) return this.idCache;
            }
            // generate a string that identifies a search so results can be re-used in a cache
            final StringBuilder context = new StringBuilder(180);
            if (anonymized) {
                context.append(anonymizedQueryHashes(this.queryGoal.getIncludeHashes()));
                context.append('-');
                context.append(anonymizedQueryHashes(this.queryGoal.getExcludeHashes()));
            } else {
                context.append(hashSet2hashString(this.queryGoal.getIncludeHashes()));
                context.append('-');
                context.append(hashSet2hashString(this.queryGoal.getExcludeHashes()));
            }
            //context.append(asterisk);
            //context.append(this.domType);
            context.append(asterisk);
            context.append(this.contentdom).append(asterisk);
            context.append(this.strictContentDom).append(asterisk);
            context.append(this.zonecode).append(asterisk);
            context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);
            context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk);
            context.append(Base64Order.enhancedCoder.encodeString(this.urlMaskString)).append(asterisk);
            context.append(this.modifier.sitehash).append(asterisk);
            context.append(this.modifier.author).append(asterisk);
            context.append(this.modifier.protocol).append(asterisk);
            context.append(this.modifier.filetype).append(asterisk);
            context.append(this.modifier.collection).append(asterisk);
            context.append(this.modifier.toString()).append(asterisk);
            context.append(this.siteexcludes).append(asterisk);
context.append(this.targetlang).append(asterisk); context.append(this.domType).append(asterisk); context.append(this.constraint).append(asterisk); context.append(this.maxDistance).append(asterisk); context.append(this.tld).append(asterisk); context.append(this.inlink).append(asterisk); context.append(this.lat).append(asterisk).append(this.lon).append(asterisk).append(this.radius).append(asterisk); context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name()); String result = context.toString(); if (anonymized) { this.idCacheAnon = result; } else { this.idCache = result; } return result; } } /** * Build a search query URL from the given parameters. * * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...) * @param page index of the wanted page (first page is zero) * @param theQuery holds the main query parameters. Must not be null. * @param newModifier a eventual new modifier to append to the eventual ones already defined in theQuery QueryParams. Can be null. * @param newModifierReplacesOld when newModifier is not null, it is appended in addition * to existing modifier(s) - if it is empty it overwrites (clears) existing * modifier(s) * @param authenticatedFeatures * when true, access to authentication protected search features is * wanted * @return a StringBuilder instance with the URL to the new search result page */ public static StringBuilder navurl(final RequestHeader.FileType ext, final int page, final QueryParams theQuery, final String newModifier, boolean newModifierReplacesOld, final boolean authenticatedFeatures) { final StringBuilder sb = navurlBase(ext, theQuery, newModifier, newModifierReplacesOld, authenticatedFeatures); sb.append("&startRecord="); sb.append(page * theQuery.itemsPerPage()); return sb; } /** * Build a search query URL from the given parameters, removing only the given single query modifier. * * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...) * @param page index of the wanted page (first page is zero) * @param theQuery holds the main query parameters. Must not be null. * @param modifierToRemove the query modifier to remove (e.g. "keyword:word", "/language/en", "site:example.org"...) * @param authenticatedFeatures * when true, access to authentication protected search features is * wanted * @return the URL to the new search result page */ public static String navUrlWithSingleModifierRemoved(final RequestHeader.FileType ext, final int page, final QueryParams theQuery, final String modifierToRemove, final boolean authenticatedFeatures) { final StringBuilder sb = new StringBuilder(120); sb.append("yacysearch."); sb.append(ext.name().toLowerCase(Locale.ROOT)); sb.append("?query="); sb.append(theQuery.getQueryGoal().getQueryString(true)); if (!theQuery.modifier.isEmpty()) { String modifierString = theQuery.modifier.toString(); if(StringUtils.isNotBlank(modifierToRemove)) { if(modifierString.startsWith(modifierToRemove)) { modifierString = modifierString.substring(modifierToRemove.length()); } else { modifierString = modifierString.replace(" " + modifierToRemove, ""); } } if(StringUtils.isNotBlank(modifierString)) { sb.append("+" + modifierString.trim()); } } appendNavUrlQueryParams(sb, theQuery, authenticatedFeatures); return sb.toString(); } /** * Build a search query URL with a new search query string, but keeping any already defined eventual modifiers. * * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...) 
* @param page index of the wanted page (first page is zero) * @param theQuery holds the main query parameters. Must not be null. * @param authenticatedFeatures * when true, access to authentication protected search features is * wanted * @return the URL to the new search result page */ public static String navUrlWithNewQueryString(final RequestHeader.FileType ext, final int page, final QueryParams theQuery, final String newQueryString, final boolean authenticatedFeatures) { final StringBuilder sb = new StringBuilder(120); sb.append("yacysearch."); sb.append(ext.name().toLowerCase(Locale.ROOT)); sb.append("?query="); sb.append(new QueryGoal(newQueryString).getQueryString(true)); if (!theQuery.modifier.isEmpty()) { sb.append("+" + theQuery.modifier.toString()); } appendNavUrlQueryParams(sb, theQuery, authenticatedFeatures); return sb.toString(); } /** * construct navigator url * * @param ext * extension of servlet (e.g. html, rss) * @param theQuery * search query * @param newModifier optional new modifier. - if null existing modifier(s) of theQuery are * appended - if not null this new modifier is appended in addition * to eventually existing modifier(s) - if isEmpty overwrites (clears) any eventual existing * modifier(s) * @param newModifierReplacesOld considered only when newModifier is not null and not empty. When true, any existing modifiers with the same name are replaced with the new one. * @param authenticatedFeatures * when true, access to authentication protected search features is * wanted * @return url to new search result page */ public static StringBuilder navurlBase(final RequestHeader.FileType ext, final QueryParams theQuery, final String newModifier, final boolean newModifierReplacesOld, final boolean authenticatedFeatures) { final StringBuilder sb = new StringBuilder(120); sb.append("yacysearch."); sb.append(ext.name().toLowerCase(Locale.ROOT)); sb.append("?query="); sb.append(theQuery.getQueryGoal().getQueryString(true)); if (newModifier == null) { if (!theQuery.modifier.isEmpty()) { sb.append("+" + theQuery.modifier.toString()); } } else { if (!newModifier.isEmpty()) { if (!theQuery.modifier.isEmpty()) { sb.append("+" + theQuery.modifier.toString()); } if (newModifierReplacesOld) { removeOldModifiersFromNavUrl(sb, newModifier); } try { sb.append("+" + URLEncoder.encode(newModifier, StandardCharsets.UTF_8.name())); } catch (final UnsupportedEncodingException e) { sb.append("+" + newModifier); } } } appendNavUrlQueryParams(sb, theQuery, authenticatedFeatures); return sb; } /** * Append search query parameters to the URL builder already filled with the beginning of the URL. * * @param sb the URL string builder to fill. Must not be null. * @param theQuery holds the main query parameters. Must not be null. * @param authenticatedFeatures * when true, access to authentication protected search features is * wanted */ protected static void appendNavUrlQueryParams(final StringBuilder sb, final QueryParams theQuery, final boolean authenticatedFeatures) { sb.append("&maximumRecords="); sb.append(theQuery.itemsPerPage()); sb.append("&resource="); sb.append((theQuery.isLocal()) ? "local" : "global"); sb.append("&verify="); sb.append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName()); sb.append("&prefermaskfilter="); sb.append(theQuery.prefer); sb.append("&cat=href"); sb.append("&constraint="); sb.append((theQuery.constraint == null) ? 
"" : theQuery.constraint.exportB64()); sb.append("&contentdom="); sb.append(theQuery.contentdom.toString()); sb.append("&strictContentDom="); sb.append(String.valueOf(theQuery.isStrictContentDom())); sb.append("&former="); sb.append(theQuery.getQueryGoal().getQueryString(true)); if(authenticatedFeatures) { sb.append("&auth"); } } /** * Remove from the URL builder any query modifiers with the same name that the new modifier * @param sb * a StringBuilder holding the search URL navigation being built. * Must not be null and contain the URL base and the query string * with its eventual modifiers * @param newModifier * a new modifier of form key:value. Must not be null. */ protected static void removeOldModifiersFromNavUrl(final StringBuilder sb, final String newModifier) { int nmpi = newModifier.indexOf(":"); if (nmpi > 0) { final String newModifierKey = newModifier.substring(0, nmpi) + ":"; int sameModifierIndex = sb.indexOf(newModifierKey); while (sameModifierIndex > 0) { final int spaceModifierIndex = sb.indexOf(" ", sameModifierIndex); if(spaceModifierIndex > sameModifierIndex) { /* There are other modifiers after the matching one : we only remove the old matching modifier */ sb.delete(sameModifierIndex, spaceModifierIndex + 1); } else { /* The matching modifier is the last : we truncate the builder */ sb.setLength(sameModifierIndex); } sameModifierIndex = sb.indexOf(newModifierKey); } if (sb.charAt(sb.length() - 1) == '+') { sb.setLength(sb.length() - 1); } if (sb.charAt(sb.length() - 1) == ' ') { sb.setLength(sb.length() - 1); } } } }