You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
663 lines
30 KiB
663 lines
30 KiB
// QueryParams.java
|
|
// -----------------------
|
|
// part of YACY
|
|
// (C) by Michael Peter Christen; mc@yacy.net
|
|
// first published on http://www.anomic.de
|
|
// Frankfurt, Germany, 2005
|
|
// Created: 10.10.2005
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.search.query;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.PatternSyntaxException;
|
|
|
|
import org.apache.solr.client.solrj.SolrQuery;
|
|
import org.apache.solr.client.solrj.SolrQuery.SortClause;
|
|
|
|
import net.yacy.cora.document.ASCII;
|
|
import net.yacy.cora.document.analysis.Classification;
|
|
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
|
import net.yacy.cora.federate.solr.Ranking;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.geo.GeoLocation;
|
|
import net.yacy.cora.lod.vocabulary.Tagging;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.document.LibraryProvider;
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.data.word.WordReferenceRow;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
import net.yacy.kelondro.util.SetTools;
|
|
import net.yacy.peers.Seed;
|
|
import net.yacy.search.index.Segment;
|
|
import net.yacy.search.ranking.RankingProfile;
|
|
import net.yacy.search.schema.CollectionConfiguration;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
|
|
public final class QueryParams {
|
|
|
|
/**
 * The domain a search is executed in.
 */
public enum Searchdom {

    LOCAL, CLUSTER, GLOBAL;

    /**
     * @return "local" for {@link #LOCAL}; "global" for both {@link #GLOBAL} and {@link #CLUSTER}
     */
    @Override
    public String toString() {
        // CLUSTER intentionally maps to "global" (not "cluster"): a cluster search is a global search
        return (this == CLUSTER || this == GLOBAL) ? "global" : "local";
    }
}
|
|
|
|
private static final CollectionSchema[] defaultfacetfields = new CollectionSchema[]{
|
|
CollectionSchema.host_s, CollectionSchema.url_protocol_s, CollectionSchema.url_file_ext_s, CollectionSchema.author_sxt};
|
|
|
|
private static final int defaultmaxfacets = 30;
|
|
private static final String ampersand = "&";
|
|
public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");
|
|
public static final Pattern catchall_pattern = Pattern.compile(".*");
|
|
private static final Pattern matchnothing_pattern = Pattern.compile("");
|
|
|
|
private final QueryGoal queryGoal;
|
|
public int itemsPerPage;
|
|
public int offset;
|
|
public Pattern urlMask;
|
|
|
|
public final Pattern prefer;
|
|
public final String tld, inlink;
|
|
boolean urlMask_isCatchall;
|
|
public final Classification.ContentDomain contentdom;
|
|
public final String targetlang;
|
|
protected final Collection<Tagging.Metatag> metatags;
|
|
public final Searchdom domType;
|
|
private final int zonecode;
|
|
public final int maxDistance;
|
|
public final Bitfield constraint;
|
|
public final boolean allofconstraint;
|
|
protected CacheStrategy snippetCacheStrategy;
|
|
public final RankingProfile ranking;
|
|
private final Segment indexSegment;
|
|
public final String clienthost; // this is the client host that starts the query, not a site operator
|
|
protected final Set<String> siteexcludes; // set of domain hashes that are excluded if not included by sitehash
|
|
public final QueryModifier modifier;
|
|
public Seed remotepeer;
|
|
public final long starttime; // the time when the query started, how long it should take and the time when the timeout is reached (milliseconds)
|
|
protected final long maxtime;
|
|
private final long timeout;
|
|
// values that are set after a search:
|
|
public int transmitcount; // number of results that had been shown to the user
|
|
public long searchtime, urlretrievaltime, snippetcomputationtime; // time to perform the search, to get all the urls, and to compute the snippets
|
|
public final String userAgent;
|
|
protected boolean filterfailurls, filterscannerfail;
|
|
protected double lat, lon, radius;
|
|
public List<String> facetfields;
|
|
public int maxfacets;
|
|
private SolrQuery cachedQuery;
|
|
private CollectionConfiguration solrSchema;
|
|
|
|
/**
 * Creates a query with default settings: catch-all url mask, empty prefer pattern,
 * {@link ContentDomain#ALL}, {@link Searchdom#LOCAL}, target language "en", offset 0,
 * no metatags, no site restrictions, no location restriction and a maximum search
 * time of 10000 milliseconds.
 *
 * @param query_original the query string handed to the {@link QueryGoal}
 * @param query_words the query words handed to the {@link QueryGoal}
 * @param itemsPerPage the number of result lines per result page
 * @param constraint the constraint bitfield for this query
 * @param indexSegment the index segment that is searched; its default Solr
 *        configuration decides which default facet fields are used
 * @param ranking the ranking profile
 * @param userAgent the user agent of the requesting client
 */
public QueryParams(
        final String query_original,
        final String query_words,
        final int itemsPerPage,
        final Bitfield constraint,
        final Segment indexSegment,
        final RankingProfile ranking,
        final String userAgent) {
    this.queryGoal = new QueryGoal(query_original, query_words);
    this.ranking = ranking;
    this.modifier = new QueryModifier();
    this.maxDistance = Integer.MAX_VALUE;
    this.urlMask = catchall_pattern;
    this.urlMask_isCatchall = true;
    this.tld = null;
    this.inlink = null;
    this.prefer = matchnothing_pattern;
    this.contentdom = ContentDomain.ALL;
    this.itemsPerPage = itemsPerPage;
    this.offset = 0;
    this.targetlang = "en";
    this.metatags = new ArrayList<Tagging.Metatag>(0);
    this.domType = Searchdom.LOCAL;
    this.zonecode = DigestURI.TLD_any_zone_filter;
    this.constraint = constraint;
    this.allofconstraint = false;
    this.snippetCacheStrategy = null;
    this.clienthost = null;
    this.siteexcludes = null;
    this.remotepeer = null;
    this.starttime = System.currentTimeMillis();
    this.maxtime = 10000;
    // the timeout is the point in time 'maxtime' milliseconds after the start;
    // (the previous code added the still unassigned 'timeout' field to the start time)
    this.timeout = this.starttime + this.maxtime;
    this.indexSegment = indexSegment;
    this.userAgent = userAgent;
    this.transmitcount = 0;
    this.filterfailurls = false;
    this.filterscannerfail = false;
    this.lat = 0.0d;
    this.lon = 0.0d;
    this.radius = 0.0d;

    // facets: every default facet field that is part of the Solr schema, plus one field per vocabulary
    this.facetfields = new ArrayList<String>();
    this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
    for (CollectionSchema f: defaultfacetfields) {
        if (this.solrSchema.contains(f)) this.facetfields.add(f.getSolrFieldName());
    }
    for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
        this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
    }
    this.maxfacets = defaultmaxfacets;
    this.cachedQuery = null;
}
|
|
|
|
public QueryParams(
|
|
final QueryGoal queryGoal,
|
|
final QueryModifier modifier,
|
|
final int maxDistance,
|
|
final String prefer,
|
|
final ContentDomain contentdom,
|
|
final String language,
|
|
final Collection<Tagging.Metatag> metatags,
|
|
final CacheStrategy snippetCacheStrategy,
|
|
final int itemsPerPage,
|
|
final int offset,
|
|
final String urlMask,
|
|
final String tld,
|
|
final String inlink,
|
|
final Searchdom domType,
|
|
final int domMaxTargets,
|
|
final Bitfield constraint,
|
|
final boolean allofconstraint,
|
|
final Set<String> siteexcludes,
|
|
final int domainzone,
|
|
final String host,
|
|
final boolean specialRights,
|
|
final Segment indexSegment,
|
|
final RankingProfile ranking,
|
|
final String userAgent,
|
|
final boolean filterfailurls,
|
|
final boolean filterscannerfail,
|
|
final double lat,
|
|
final double lon,
|
|
final double radius
|
|
) {
|
|
this.queryGoal = queryGoal;
|
|
this.modifier = modifier;
|
|
this.ranking = ranking;
|
|
this.maxDistance = maxDistance;
|
|
this.contentdom = contentdom;
|
|
this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
|
|
this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
|
|
try {
|
|
this.urlMask = Pattern.compile(urlMask.toLowerCase());
|
|
} catch (final PatternSyntaxException ex) {
|
|
throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
|
|
}
|
|
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
|
|
if (this.urlMask_isCatchall) {
|
|
if (modifier.protocol != null) {
|
|
this.urlMask = Pattern.compile(modifier.protocol + ".*");
|
|
this.urlMask_isCatchall = false;
|
|
}
|
|
if (tld != null) {
|
|
this.urlMask = Pattern.compile(".*" + tld + ".*");
|
|
this.urlMask_isCatchall = false;
|
|
}
|
|
if (modifier.filetype != null) {
|
|
this.urlMask = Pattern.compile(".*" + modifier.filetype + ".*");
|
|
this.urlMask_isCatchall = false;
|
|
}
|
|
}
|
|
this.tld = tld;
|
|
this.inlink = inlink;
|
|
try {
|
|
this.prefer = Pattern.compile(prefer);
|
|
} catch (final PatternSyntaxException ex) {
|
|
throw new IllegalArgumentException("Not a valid regular expression: " + prefer, ex);
|
|
}
|
|
this.prefer.toString().equals(matchnothing_pattern.toString());
|
|
assert language != null;
|
|
this.targetlang = language;
|
|
this.metatags = metatags;
|
|
this.domType = domType;
|
|
this.zonecode = domainzone;
|
|
this.constraint = constraint;
|
|
this.allofconstraint = allofconstraint;
|
|
this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null: siteexcludes;
|
|
this.snippetCacheStrategy = snippetCacheStrategy;
|
|
this.clienthost = host;
|
|
this.remotepeer = null;
|
|
this.starttime = Long.valueOf(System.currentTimeMillis());
|
|
this.maxtime = 10000;
|
|
this.timeout = this.starttime + this.timeout;
|
|
this.indexSegment = indexSegment;
|
|
this.userAgent = userAgent;
|
|
this.transmitcount = 0;
|
|
this.filterfailurls = filterfailurls;
|
|
this.filterscannerfail = filterscannerfail;
|
|
// we normalize here the location and radius because that should cause a better caching
|
|
// and as surplus it will increase privacy
|
|
this.lat = Math.floor(lat * this.kmNormal) / this.kmNormal;
|
|
this.lon = Math.floor(lon * this.kmNormal) / this.kmNormal;
|
|
this.radius = Math.floor(radius * this.kmNormal + 1) / this.kmNormal;
|
|
this.facetfields = new ArrayList<String>();
|
|
|
|
this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
|
|
for (CollectionSchema f: defaultfacetfields) {
|
|
if (solrSchema.contains(f)) facetfields.add(f.getSolrFieldName());
|
|
}
|
|
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
|
|
this.maxfacets = defaultmaxfacets;
|
|
this.cachedQuery = null;
|
|
}
|
|
|
|
// Grid factor used by the constructor to normalize lat, lon and radius.
private double kmNormal = 100.d; // 100 is approximately 40000.d / 360.d == 111.11 - if lat/lon is multiplied with this, rounded and divided by this, the location is normalized to a 1km grid
|
|
|
|
public Segment getSegment() {
|
|
return this.indexSegment;
|
|
}
|
|
|
|
public int neededResults() {
|
|
// the number of result lines that must be computed
|
|
return this.offset + this.itemsPerPage;
|
|
}
|
|
|
|
public int itemsPerPage() {
|
|
// the number of result lines that are displayed at once (size of result page)
|
|
return this.itemsPerPage;
|
|
}
|
|
|
|
public void setOffset(final int newOffset) {
|
|
this.offset = newOffset;
|
|
}
|
|
|
|
public boolean isLocal() {
|
|
return this.domType == Searchdom.LOCAL;
|
|
}
|
|
|
|
public static HandleSet hashes2Set(final String query) {
|
|
final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
|
|
if (query != null) {
|
|
for (int i = 0; i < (query.length() / Word.commonHashLength); i++) try {
|
|
keyhashes.put(ASCII.getBytes(query.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength)));
|
|
} catch (final SpaceExceededException e) {
|
|
Log.logException(e);
|
|
}
|
|
}
|
|
return keyhashes;
|
|
}
|
|
|
|
public static String hashSet2hashString(final HandleSet hashes) {
|
|
final byte[] bb = new byte[hashes.size() * Word.commonHashLength];
|
|
int p = 0;
|
|
for (final byte[] b : hashes) {
|
|
assert b.length == Word.commonHashLength : "hash = " + ASCII.String(b);
|
|
System.arraycopy(b, 0, bb, p, Word.commonHashLength);
|
|
p += Word.commonHashLength;
|
|
}
|
|
return ASCII.String(bb);
|
|
}
|
|
|
|
public static String hashSet2hashString(final Set<String> hashes) {
|
|
final byte[] bb = new byte[hashes.size() * Word.commonHashLength];
|
|
int p = 0;
|
|
for (final String s : hashes) {
|
|
assert s.length() == Word.commonHashLength : "hash = " + s;
|
|
System.arraycopy(ASCII.getBytes(s), 0, bb, p, Word.commonHashLength);
|
|
p += Word.commonHashLength;
|
|
}
|
|
return ASCII.String(bb);
|
|
}
|
|
|
|
public static String anonymizedQueryHashes(final HandleSet hashes) {
|
|
// create a more anonymized representation of a query hashes for logging
|
|
final Iterator<byte[]> i = hashes.iterator();
|
|
final StringBuilder sb = new StringBuilder(hashes.size() * (Word.commonHashLength + 2) + 2);
|
|
sb.append("[");
|
|
byte[] hash;
|
|
if (i.hasNext()) {
|
|
hash = i.next();
|
|
sb.append(ASCII.String(hash).substring(0, 3)).append(".........");
|
|
}
|
|
while (i.hasNext()) {
|
|
hash = i.next();
|
|
sb.append(", ").append(ASCII.String(hash).substring(0, 3)).append(".........");
|
|
}
|
|
sb.append("]");
|
|
return sb.toString();
|
|
}
|
|
|
|
/**
|
|
* check if the given text matches with the query
|
|
* this checks inclusion and exclusion words
|
|
* @param text
|
|
* @return true if the query matches with the given text
|
|
*/
|
|
private final boolean matchesText(final String text) {
|
|
boolean ret = false;
|
|
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
|
|
if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) {
|
|
ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
|
|
// returns true if any of the word hashes in keyhashes appear in the String text
|
|
// to do this, all words in the string must be recognized and transcoded to word hashes
|
|
if (keyhashes == null || keyhashes.isEmpty()) return false;
|
|
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
|
|
return SetTools.anymatch(wordhashes, keyhashes);
|
|
}
|
|
|
|
public SolrQuery solrQuery() {
|
|
if (this.cachedQuery != null) {
|
|
this.cachedQuery.setStart(this.offset);
|
|
return this.cachedQuery;
|
|
}
|
|
if (this.queryGoal.getIncludeStrings().size() == 0) return null;
|
|
// construct query
|
|
final SolrQuery params = new SolrQuery();
|
|
int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0;
|
|
params.setQuery(this.queryGoal.collectionQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile).toString());
|
|
params.setParam("defType", "edismax");
|
|
Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile
|
|
|
|
String bq = ranking.getBoostQuery();
|
|
String bf = ranking.getBoostFunction();
|
|
if (bq.length() > 0) params.setParam("bq", bq);
|
|
if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
|
|
params.setStart(this.offset);
|
|
params.setRows(this.itemsPerPage);
|
|
params.setFacet(false);
|
|
|
|
// add site facets
|
|
final StringBuilder fq = new StringBuilder();
|
|
if (this.modifier.sitehash == null && this.modifier.sitehost == null) {
|
|
if (this.siteexcludes != null) {
|
|
for (String ex: this.siteexcludes) {
|
|
fq.append(" AND -").append(CollectionSchema.host_id_s.getSolrFieldName()).append(':').append(ex);
|
|
}
|
|
}
|
|
} else {
|
|
if (this.modifier.sitehost != null) {
|
|
// consider to search for hosts with 'www'-prefix, if not already part of the host name
|
|
if (this.modifier.sitehost.startsWith("www.")) {
|
|
fq.append(" AND (").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.modifier.sitehost.substring(4)).append('\"');
|
|
fq.append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.modifier.sitehost).append("\")");
|
|
} else {
|
|
fq.append(" AND (").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.modifier.sitehost).append('\"');
|
|
fq.append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"www.").append(this.modifier.sitehost).append("\")");
|
|
}
|
|
} else
|
|
fq.append(" AND ").append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(this.modifier.sitehash).append('\"');
|
|
}
|
|
|
|
// add vocabulary facets
|
|
for (Tagging.Metatag tag: this.metatags) {
|
|
fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
|
|
}
|
|
|
|
// add author facets
|
|
if (this.modifier.author != null && this.modifier.author.length() > 0 && this.solrSchema.contains(CollectionSchema.author_sxt)) {
|
|
fq.append(" AND ").append(CollectionSchema.author_sxt.getSolrFieldName()).append(":\"").append(this.modifier.author).append('\"');
|
|
}
|
|
|
|
if (this.modifier.protocol != null) {
|
|
fq.append(" AND ").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(':').append(this.modifier.protocol);
|
|
}
|
|
|
|
if (this.tld != null) {
|
|
fq.append(" AND ").append(CollectionSchema.host_dnc_s.getSolrFieldName()).append(":\"").append(this.tld).append('\"');
|
|
}
|
|
|
|
if (this.modifier.filetype != null) {
|
|
fq.append(" AND ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.modifier.filetype).append('\"');
|
|
}
|
|
|
|
if (this.contentdom == ContentDomain.IMAGE) {
|
|
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\")");
|
|
}
|
|
|
|
if (this.contentdom == ContentDomain.AUDIO) {
|
|
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aif\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aiff\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp3\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"ogg\")");
|
|
}
|
|
|
|
if (this.contentdom == ContentDomain.VIDEO) {
|
|
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mpg\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"avi\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp4\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mkv\")");
|
|
}
|
|
|
|
if (this.contentdom == ContentDomain.APP) {
|
|
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"apk\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"exe\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"dmg\"");
|
|
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"gz\")");
|
|
}
|
|
|
|
if (this.inlink != null) {
|
|
fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_txt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"');
|
|
}
|
|
|
|
if (!this.urlMask_isCatchall) {
|
|
// add a filter query on urls
|
|
String urlMaskPattern = this.urlMask.pattern();
|
|
|
|
// solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
|
|
int p;
|
|
while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
|
|
while ((p = urlMaskPattern.indexOf('/')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
|
|
while ((p = urlMaskPattern.indexOf('\\')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 2);
|
|
fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/");
|
|
}
|
|
|
|
if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
|
|
// localtion search, no special ranking
|
|
// try http://localhost:8090/solr/select?q=*:*&fq={!bbox sfield=coordinate_p pt=50.17,8.65 d=1}
|
|
|
|
//params.setQuery("!bbox " + q.toString());
|
|
//params.set("sfield", YaCySchema.coordinate_p.name());
|
|
//params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon));
|
|
//params.set("d", GeoLocation.degreeToKm(this.radius));
|
|
fq.append(" AND ").append("{!bbox sfield=" + CollectionSchema.coordinate_p.getSolrFieldName() + " pt=" + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d=" + GeoLocation.degreeToKm(this.radius) + "}");
|
|
//params.setRows(Integer.MAX_VALUE);
|
|
} else {
|
|
// set ranking
|
|
if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) {
|
|
// set a most-recent ordering
|
|
params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
|
|
//params.setSortField(CollectionSchema.last_modified.getSolrFieldName(), ORDER.desc); // deprecated in Solr 4.2
|
|
}
|
|
}
|
|
|
|
if (this.modifier.collection != null && this.modifier.collection.length() > 0) {
|
|
fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.modifier.collection));
|
|
}
|
|
|
|
if (fq.length() > 0) {
|
|
params.setFilterQueries(fq.substring(5));
|
|
}
|
|
|
|
// prepare result
|
|
Log.logInfo("Protocol", "SOLR QUERY: " + params.toString());
|
|
this.cachedQuery = params;
|
|
return params;
|
|
}
|
|
|
|
public QueryGoal getQueryGoal() {
|
|
return this.queryGoal;
|
|
}
|
|
|
|
public final Map<DigestURI, String> separateMatches(final Map<DigestURI, String> links) {
|
|
final Map<DigestURI, String> matcher = new HashMap<DigestURI, String>();
|
|
final Iterator <Map.Entry<DigestURI, String>> i = links.entrySet().iterator();
|
|
Map.Entry<DigestURI, String> entry;
|
|
DigestURI url;
|
|
String anchorText;
|
|
while (i.hasNext()) {
|
|
entry = i.next();
|
|
url = entry.getKey();
|
|
anchorText = entry.getValue();
|
|
if (matchesText(anchorText)) {
|
|
matcher.put(url, anchorText);
|
|
i.remove();
|
|
}
|
|
}
|
|
return matcher;
|
|
}
|
|
|
|
private volatile String idCacheAnon = null, idCache = null;
|
|
final static private char asterisk = '*';
|
|
public String id(final boolean anonymized) {
|
|
if (anonymized) {
|
|
if (this.idCacheAnon != null) return this.idCacheAnon;
|
|
} else {
|
|
if (this.idCache != null) return this.idCache;
|
|
}
|
|
synchronized (this) {
|
|
// do a Double-Checked Locking
|
|
if (anonymized) {
|
|
if (this.idCacheAnon != null) return this.idCacheAnon;
|
|
} else {
|
|
if (this.idCache != null) return this.idCache;
|
|
}
|
|
// generate a string that identifies a search so results can be re-used in a cache
|
|
final StringBuilder context = new StringBuilder(180);
|
|
if (anonymized) {
|
|
context.append(anonymizedQueryHashes(this.queryGoal.getIncludeHashes()));
|
|
context.append('-');
|
|
context.append(anonymizedQueryHashes(this.queryGoal.getExcludeHashes()));
|
|
} else {
|
|
context.append(hashSet2hashString(this.queryGoal.getIncludeHashes()));
|
|
context.append('-');
|
|
context.append(hashSet2hashString(this.queryGoal.getExcludeHashes()));
|
|
}
|
|
//context.append(asterisk);
|
|
//context.append(this.domType);
|
|
context.append(asterisk);
|
|
context.append(this.contentdom).append(asterisk);
|
|
context.append(this.zonecode).append(asterisk);
|
|
context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);
|
|
context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk);
|
|
context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk);
|
|
context.append(this.modifier.sitehash).append(asterisk);
|
|
context.append(this.modifier.author).append(asterisk);
|
|
context.append(this.modifier.protocol).append(asterisk);
|
|
context.append(this.modifier.filetype).append(asterisk);
|
|
context.append(this.modifier.collection).append(asterisk);
|
|
context.append(this.modifier.toString()).append(asterisk);
|
|
context.append(this.siteexcludes).append(asterisk);
|
|
context.append(this.targetlang).append(asterisk);
|
|
context.append(this.constraint).append(asterisk);
|
|
context.append(this.maxDistance).append(asterisk);
|
|
context.append(this.tld).append(asterisk);
|
|
context.append(this.inlink).append(asterisk);
|
|
context.append(this.lat).append(asterisk).append(this.lon).append(asterisk).append(this.radius).append(asterisk);
|
|
context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name());
|
|
String result = context.toString();
|
|
if (anonymized) {
|
|
this.idCacheAnon = result;
|
|
} else {
|
|
this.idCache = result;
|
|
}
|
|
return result;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* make a query anchor tag
|
|
* @param page
|
|
* @param theQuery
|
|
* @param originalUrlMask
|
|
* @param addToQuery
|
|
* @return
|
|
*/
|
|
public static StringBuilder navurl(final String ext, final int page, final QueryParams theQuery, final String newQueryString) {
|
|
|
|
final StringBuilder sb = navurlBase(ext, theQuery, newQueryString);
|
|
|
|
sb.append(ampersand);
|
|
sb.append("startRecord=");
|
|
sb.append(page * theQuery.itemsPerPage());
|
|
|
|
return sb;
|
|
}
|
|
|
|
public static StringBuilder navurlBase(final String ext, final QueryParams theQuery, final String newQueryString) {
|
|
|
|
final StringBuilder sb = new StringBuilder(120);
|
|
sb.append("/yacysearch.");
|
|
sb.append(ext);
|
|
sb.append("?query=");
|
|
sb.append(newQueryString == null ? theQuery.getQueryGoal().getOriginalQueryString(true) : newQueryString);
|
|
|
|
sb.append(ampersand);
|
|
sb.append("maximumRecords=");
|
|
sb.append(theQuery.itemsPerPage());
|
|
|
|
sb.append(ampersand);
|
|
sb.append("resource=");
|
|
sb.append((theQuery.isLocal()) ? "local" : "global");
|
|
|
|
sb.append(ampersand);
|
|
sb.append("verify=");
|
|
sb.append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName());
|
|
|
|
sb.append(ampersand);
|
|
sb.append("prefermaskfilter=");
|
|
sb.append(theQuery.prefer);
|
|
|
|
sb.append(ampersand);
|
|
sb.append("cat=href");
|
|
|
|
sb.append(ampersand);
|
|
sb.append("constraint=");
|
|
sb.append((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64());
|
|
|
|
sb.append(ampersand);
|
|
sb.append("contentdom=");
|
|
sb.append(theQuery.contentdom.toString());
|
|
|
|
sb.append(ampersand);
|
|
sb.append("former=");
|
|
sb.append(theQuery.getQueryGoal().getOriginalQueryString(true));
|
|
|
|
return sb;
|
|
}
|
|
|
|
}
|