You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
389 lines
17 KiB
389 lines
17 KiB
/**
|
|
* QueryModifier
|
|
* Copyright 2013 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
* First published 12.02.2013 on http://yacy.net
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.search.query;
|
|
|
|
import java.net.MalformedURLException;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
|
|
import org.apache.solr.common.params.CommonParams;
|
|
import org.apache.solr.common.params.MultiMapSolrParams;
|
|
import org.apache.solr.util.DateFormatUtil;
|
|
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.util.CommonPattern;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.document.DateDetection;
|
|
import net.yacy.kelondro.util.ISO639;
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
import net.yacy.server.serverObjects;
|
|
|
|
|
|
public class QueryModifier {
|
|
|
|
private final StringBuilder modifier;
|
|
public String sitehost, sitehash, filetype, protocol, language, author, collection, on, from, to;
|
|
public int timezoneOffset;
|
|
|
|
public QueryModifier(final int timezoneOffset) {
|
|
this.timezoneOffset = timezoneOffset;
|
|
this.sitehash = null;
|
|
this.sitehost = null;
|
|
this.filetype = null;
|
|
this.protocol = null;
|
|
this.language = null;
|
|
this.author = null;
|
|
this.collection = null;
|
|
this.on = null;
|
|
this.from = null;
|
|
this.to = null;
|
|
this.modifier = new StringBuilder(20);
|
|
}
|
|
|
|
/**
|
|
* Parse query string for query modifier parameters
|
|
* @param querystring
|
|
* @return query string with parameters removed
|
|
*/
|
|
public String parse(String querystring) {
|
|
|
|
// parse protocol
|
|
if ( querystring.indexOf("/https", 0) >= 0 ) {
|
|
querystring = querystring.replace("/https", "");
|
|
this.protocol = "https";
|
|
add("/https");
|
|
} else if ( querystring.indexOf("/http", 0) >= 0 ) {
|
|
querystring = querystring.replace("/http", "");
|
|
this.protocol = "http";
|
|
add("/http");
|
|
}
|
|
if ( querystring.indexOf("/ftp", 0) >= 0 ) {
|
|
querystring = querystring.replace("/ftp", "");
|
|
this.protocol = "ftp";
|
|
add("/ftp");
|
|
}
|
|
if ( querystring.indexOf("/smb", 0) >= 0 ) {
|
|
querystring = querystring.replace("/smb", "");
|
|
this.protocol = "smb";
|
|
add("/smb");
|
|
}
|
|
if ( querystring.indexOf("/file", 0) >= 0 ) {
|
|
querystring = querystring.replace("/file", "");
|
|
this.protocol = "file";
|
|
add("/file");
|
|
}
|
|
|
|
// parse 'common search mistakes' like guessed regular expressions
|
|
// (changes "abc*" to "ab", "abc *def" to "abcdef") TODO: handle most common "abc*"
|
|
int p = querystring.indexOf('*');
|
|
if ((p >= 0) && ((p > 0 && querystring.charAt(p - 1) != ' ') || (p > 1 && p < querystring.length() - 1 && querystring.charAt(p + 1) != ' '))) {
|
|
querystring = querystring.substring(0, p - 1) + querystring.substring(p + 1);
|
|
}
|
|
|
|
// parse filetype
|
|
querystring = filetypeParser(querystring, "filetype:");
|
|
|
|
// parse site
|
|
final int sp = querystring.indexOf("site:", 0);
|
|
if (sp >= 0) {
|
|
int ftb = querystring.indexOf(' ', sp);
|
|
if ( ftb == -1 ) {
|
|
ftb = querystring.length();
|
|
}
|
|
this.sitehost = querystring.substring(sp + 5, ftb);
|
|
querystring = querystring.replace("site:" + this.sitehost, "");
|
|
while ( this.sitehost.length() > 0 && this.sitehost.charAt(0) == '.' ) {
|
|
this.sitehost = this.sitehost.substring(1);
|
|
}
|
|
while ( sitehost.endsWith(".") ) {
|
|
this.sitehost = this.sitehost.substring(0, this.sitehost.length() - 1);
|
|
}
|
|
try {
|
|
this.sitehash = DigestURL.hosthash(this.sitehost, this.sitehost.startsWith("ftp.") ? 21 : 80);
|
|
} catch (MalformedURLException e) {
|
|
this.sitehash = "";
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
add("site:" + this.sitehost);
|
|
}
|
|
|
|
// parse author
|
|
final int authori = querystring.indexOf("author:", 0);
|
|
if (authori >= 0) {
|
|
// check if the author was given with single quotes or without
|
|
final boolean quotes = (querystring.charAt(authori + 7) == '(');
|
|
if ( quotes ) {
|
|
int ftb = querystring.indexOf(')', authori + 8);
|
|
this.author = querystring.substring(authori + 8, ftb == -1 ? querystring.length() : ftb);
|
|
querystring = querystring.replace("author:(" + this.author + ")", "");
|
|
add("author:(" + author + ")");
|
|
} else {
|
|
int ftb = querystring.indexOf(' ', authori);
|
|
if ( ftb == -1 ) {
|
|
ftb = querystring.length();
|
|
}
|
|
this.author = querystring.substring(authori + 7, ftb);
|
|
querystring = querystring.replace("author:" + this.author, "").replace(" ", " ").trim();
|
|
add("author:" + author);
|
|
}
|
|
}
|
|
|
|
// parse collection
|
|
int collectioni = querystring.indexOf("collection:", 0);
|
|
while (collectioni >= 0) { // due to possible collision with "on:" modifier make sure no "collection:" remains
|
|
int ftb = querystring.indexOf(' ', collectioni);
|
|
this.collection = querystring.substring(collectioni + 11, ftb == -1 ? querystring.length() : ftb);
|
|
querystring = querystring.replace("collection:" + this.collection, "").replace(" ", " ").trim();
|
|
collectioni = querystring.indexOf("collection:", 0);
|
|
}
|
|
if (this.collection != null) add("collection:" + this.collection);
|
|
|
|
// parse on-date, must be after "collection:" as "on:" contained in it
|
|
final int oni = querystring.indexOf("on:", 0);
|
|
if (oni >= 0) {
|
|
int ftb = querystring.indexOf(' ', oni);
|
|
this.on = querystring.substring(oni + 3, ftb == -1 ? querystring.length() : ftb);
|
|
querystring = querystring.replace("on:" + this.on, "").replace(" ", " ").trim();
|
|
add("on:" + this.on);
|
|
}
|
|
|
|
// parse from-date
|
|
final int fromi = querystring.indexOf("from:", 0);
|
|
if (fromi >= 0) {
|
|
int ftb = querystring.indexOf(' ', fromi);
|
|
this.from = querystring.substring(fromi + 5, ftb == -1 ? querystring.length() : ftb);
|
|
querystring = querystring.replace("from:" + this.from, "").replace(" ", " ").trim();
|
|
add("from:" + this.from);
|
|
}
|
|
|
|
// parse to-date
|
|
final int toi = querystring.indexOf("to:", 0);
|
|
if (toi >= 0) {
|
|
int ftb = querystring.indexOf(' ', toi);
|
|
this.to = querystring.substring(toi + 3, ftb == -1 ? querystring.length() : ftb);
|
|
querystring = querystring.replace("to:" + this.to, "").replace(" ", " ").trim();
|
|
add("to:" + this.to);
|
|
}
|
|
|
|
// parse language
|
|
final int langi = querystring.indexOf("/language/");
|
|
if (langi >= 0) {
|
|
if (querystring.length() >= (langi + 12)) {
|
|
this.language = querystring.substring(langi + 10, langi + 12);
|
|
querystring = querystring.replace("/language/" + this.language, "");
|
|
if (this.language.length() == 2 && ISO639.exists(this.language)) { // only 2-digit codes valid
|
|
this.language = this.language.toLowerCase();
|
|
add("/language/" + this.language);
|
|
} else {
|
|
this.language = null;
|
|
}
|
|
}
|
|
}
|
|
|
|
// check the number of quotes in the string; if there is only one double-quote, add another one. this will prevent error messages in
|
|
p = querystring.indexOf('"');
|
|
if (p >= 0) {
|
|
int q = querystring.indexOf('"', p + 1);
|
|
if (q < 0) querystring += '"';
|
|
}
|
|
|
|
return querystring.trim();
|
|
}
|
|
|
|
/**
|
|
* Parse query string for filetype (file extension) parameter
|
|
* @param querystring
|
|
* @param filetypePrefix "filetype:"
|
|
* @return querystring with filetype parameter removed
|
|
*/
|
|
private String filetypeParser(String querystring, final String filetypePrefix) {
|
|
final int ftp = querystring.indexOf(filetypePrefix, 0);
|
|
if ( ftp >= 0 ) {
|
|
int ftb = querystring.indexOf(' ', ftp);
|
|
if ( ftb < 0 ) ftb = querystring.length();
|
|
filetype = querystring.substring(ftp + filetypePrefix.length(), ftb);
|
|
querystring = querystring.replace(filetypePrefix + filetype, "");
|
|
while ( !filetype.isEmpty() && filetype.charAt(0) == '.' ) {
|
|
filetype = filetype.substring(1);
|
|
}
|
|
add(filetypePrefix + filetype);
|
|
if (filetype.isEmpty()) filetype = null;
|
|
if (querystring.length() == 0) querystring = "*";
|
|
}
|
|
return querystring;
|
|
}
|
|
|
|
public void add(String m) {
|
|
if (modifier.length() > 0 && modifier.charAt(modifier.length() - 1) != ' ' && m != null && m.length() > 0) modifier.append(' ');
|
|
if (m != null) modifier.append(m);
|
|
}
|
|
|
|
public void remove(String m) {
|
|
int p = modifier.indexOf(" " + m);
|
|
if (p >= 0) modifier.delete(p, p + m.length() + 1);
|
|
p = modifier.indexOf(m);
|
|
if (p == 0) modifier.delete(p, p + m.length());
|
|
if (modifier.length() > 0 && modifier.charAt(0) == ' ') modifier.delete(0, 1);
|
|
}
|
|
|
|
@Override
|
|
public String toString() {
|
|
return this.modifier.toString();
|
|
}
|
|
|
|
/**
|
|
* @return true if no modifier active
|
|
*/
|
|
public boolean isEmpty() {
|
|
return this.modifier.length() == 0;
|
|
}
|
|
|
|
private StringBuilder apply(String FQ) {
|
|
|
|
final StringBuilder fq = new StringBuilder(FQ);
|
|
|
|
if (this.sitehost != null && this.sitehost.length() > 0 && fq.indexOf(CollectionSchema.host_s.getSolrFieldName()) < 0) {
|
|
// consider to search for hosts with 'www'-prefix, if not already part of the host name
|
|
if (this.sitehost.startsWith("www.")) {
|
|
fq.append(" AND (").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.sitehost.substring(4)).append('\"');
|
|
fq.append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.sitehost).append("\")");
|
|
} else {
|
|
fq.append(" AND (").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"").append(this.sitehost).append('\"');
|
|
fq.append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(":\"www.").append(this.sitehost).append("\")");
|
|
}
|
|
}
|
|
if (this.sitehash != null && this.sitehash.length() > 0 && fq.indexOf(CollectionSchema.host_id_s.getSolrFieldName()) < 0) {
|
|
fq.append(" AND ").append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(this.sitehash).append('\"');
|
|
}
|
|
|
|
if (this.filetype != null && this.filetype.length() > 0 && fq.indexOf(CollectionSchema.url_file_ext_s.getSolrFieldName()) < 0) {
|
|
fq.append(" AND ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.filetype).append('\"');
|
|
}
|
|
|
|
if (this.author != null && this.author.length() > 0 && fq.indexOf(CollectionSchema.author_sxt.getSolrFieldName()) < 0) {
|
|
fq.append(" AND ").append(CollectionSchema.author_sxt.getSolrFieldName()).append(":\"").append(this.author).append('\"');
|
|
}
|
|
|
|
if (this.collection != null && this.collection.length() > 0 && fq.indexOf(CollectionSchema.collection_sxt.getSolrFieldName()) < 0) {
|
|
fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.collection));
|
|
}
|
|
|
|
if (fq.indexOf(CollectionSchema.dates_in_content_dts.getSolrFieldName()) < 0) {
|
|
if (this.on != null && this.on.length() > 0) {
|
|
fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on, this.timezoneOffset));
|
|
}
|
|
|
|
if (this.from != null && this.from.length() > 0 && (this.to == null || this.to.equals("*"))) {
|
|
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, null, this.timezoneOffset));
|
|
}
|
|
|
|
if ((this.from == null || this.from.equals("*")) && this.to != null && this.to.length() > 0) {
|
|
fq.append(" AND ").append(QueryModifier.parseFromToExpression(null, this.to, this.timezoneOffset));
|
|
}
|
|
|
|
if (this.from != null && this.from.length() > 0 && this.to != null && this.to.length() > 0) {
|
|
fq.append(" AND ").append(QueryModifier.parseFromToExpression(this.from, this.to, this.timezoneOffset));
|
|
}
|
|
}
|
|
|
|
if (this.protocol != null && this.protocol.length() > 0 && fq.indexOf(CollectionSchema.url_protocol_s.getSolrFieldName()) < 0) {
|
|
fq.append(" AND ").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":\"").append(this.protocol).append('\"');
|
|
}
|
|
|
|
return fq;
|
|
}
|
|
|
|
public void apply(serverObjects post) {
|
|
|
|
final StringBuilder fq = apply(post.get(CommonParams.FQ,""));
|
|
|
|
if (fq.length() > 0) {
|
|
String fqs = fq.toString();
|
|
if (fqs.startsWith(" AND ")) fqs = fqs.substring(5);
|
|
post.remove(CommonParams.FQ);
|
|
post.put(CommonParams.FQ, fqs);
|
|
}
|
|
}
|
|
|
|
public void apply(MultiMapSolrParams mmsp) {
|
|
|
|
final StringBuilder fq = apply(mmsp.get(CommonParams.FQ,""));
|
|
|
|
if (fq.length() > 0) {
|
|
String fqs = fq.toString();
|
|
if (fqs.startsWith(" AND ")) fqs = fqs.substring(5);
|
|
mmsp.getMap().remove(CommonParams.FQ);
|
|
mmsp.getMap().put(CommonParams.FQ, new String[]{fqs});
|
|
}
|
|
}
|
|
|
|
/**
|
|
* parse a GSA site description string and create a filter query string
|
|
* which is used to restrict the search result to collections as named with the site attributes
|
|
* @param collectionDescription
|
|
* @return a solr query string which shall be used for a filter query
|
|
*/
|
|
public static String parseCollectionExpression(String collectionDescription) {
|
|
String[] s0 = CommonPattern.VERTICALBAR.split(collectionDescription);
|
|
ArrayList<String> collections = new ArrayList<String>(2);
|
|
for (String s: s0) {
|
|
s = s.trim();
|
|
if (s.length() > 0) collections.add(s);
|
|
}
|
|
StringBuilder fq = new StringBuilder(20);
|
|
if (collections.size() > 1) {
|
|
fq.append('(').append(CollectionSchema.collection_sxt.getSolrFieldName()).append(":\"").append(collections.get(0)).append('\"');
|
|
for (int i = 1; i < collections.size(); i++) {
|
|
fq.append(" OR ").append(CollectionSchema.collection_sxt.getSolrFieldName()).append(":\"").append(collections.get(i)).append('\"');
|
|
}
|
|
fq.append(')');
|
|
} else if (collections.size() == 1) {
|
|
fq.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(":\"").append(collections.get(0)).append('\"');
|
|
}
|
|
if (fq.length() > 0) fq.insert(0, "{!tag=" + CollectionSchema.collection_sxt.getSolrFieldName() + "}");
|
|
return fq.toString();
|
|
}
|
|
|
|
public static String parseOnExpression(final String onDescription, final int timezoneOffset) {
|
|
assert onDescription != null;
|
|
Date onDate = DateDetection.parseLine(onDescription, timezoneOffset);
|
|
StringBuilder filterQuery = new StringBuilder(20);
|
|
if (onDate != null) {
|
|
String dstr = DateFormatUtil.formatExternal(onDate);
|
|
filterQuery.append(CollectionSchema.dates_in_content_dts.getSolrFieldName()).append(":[").append(dstr).append(" TO ").append(dstr).append(']');
|
|
}
|
|
return filterQuery.toString();
|
|
}
|
|
|
|
public static String parseFromToExpression(final String from, final String to, final int timezoneOffset) {
|
|
Date fromDate = from == null || from.equals("*") ? null : DateDetection.parseLine(from, timezoneOffset);
|
|
Date toDate = to == null || to.equals("*") ? null : DateDetection.parseLine(to, timezoneOffset);
|
|
StringBuilder filterQuery = new StringBuilder(20);
|
|
if (fromDate != null && toDate != null) {
|
|
String dstrFrom = fromDate == null ? "*" : DateFormatUtil.formatExternal(fromDate);
|
|
String dstrTo = toDate == null ? "*" : DateFormatUtil.formatExternal(toDate);
|
|
filterQuery.append(CollectionSchema.dates_in_content_dts.getSolrFieldName()).append(":[").append(dstrFrom).append(" TO ").append(dstrTo).append(']');
|
|
}
|
|
return filterQuery.toString();
|
|
}
|
|
|
|
}
|