added query modifier 'on'. This makes it possible to search for date

occurrences within the (web) page documents (not the document's
last-modified date!). This works only if the Solr field dates_in_content_sxt
is enabled. A search request may then have the form "term on:<date>",
for example:
gift on:24.12.2014
gift on:2014/12/24
* on:2014/12/31
For the date format you may use any kind of human-readable date
representation (yes, really!) - the on:<date> parser tries to identify the
language and also knows event names, like:
bunny on:eastern
... as long as the date term contains no spaces (use a dot instead). A
further enhancement will be made to also accept strings enclosed in
quotes.
pull/1/head
Michael Peter Christen 10 years ago
parent 1cfddea578
commit 65125439fe

@ -35,6 +35,8 @@ import java.util.TreeMap;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
/** /**
* The purpose of this class exceeds the demands on simple date parsing using a SimpleDateFormat * The purpose of this class exceeds the demands on simple date parsing using a SimpleDateFormat
* because it tries to * because it tries to
@ -494,6 +496,20 @@ public class DateDetection {
return dates; return dates;
} }
/**
 * Parses a single date expression into a {@link Date}.
 * <p>
 * The text is tried against a fixed sequence of formats, in this order:
 * {@code CONFORM}, {@code GenericFormatter.FORMAT_SHORT_DAY},
 * {@code GenericFormatter.FORMAT_RFC1123_SHORT} and {@code GenericFormatter.FORMAT_ANSIC}.
 * The first format that parses the text wins. If none of them matches, the text is
 * handed to {@code parse(text)} and the first date of the returned set is used.
 *
 * @param text the date expression to parse; may be {@code null}
 * @return the parsed date, or {@code null} if {@code text} is {@code null} or no
 *         parser produced a date
 */
public static Date parseLine(String text) {
    // guard: the format parsers below would fail with a NullPointerException on null input
    if (text == null) {
        return null;
    }

    Date d = null;
    try {
        d = CONFORM.parse(text);
    } catch (ParseException ignored) {
        // not in this format; try the next one
    }
    if (d == null) {
        try {
            d = GenericFormatter.FORMAT_SHORT_DAY.parse(text);
        } catch (ParseException ignored) {
            // not in this format; try the next one
        }
    }
    if (d == null) {
        try {
            d = GenericFormatter.FORMAT_RFC1123_SHORT.parse(text);
        } catch (ParseException ignored) {
            // not in this format; try the next one
        }
    }
    if (d == null) {
        try {
            d = GenericFormatter.FORMAT_ANSIC.parse(text);
        } catch (ParseException ignored) {
            // not in this format; fall back to the free-form detection below
        }
    }
    if (d == null) {
        // last resort: free-form detection; take the first date it reports
        Set<Date> dd = parse(text);
        if (dd != null && !dd.isEmpty()) {
            d = dd.iterator().next();
        }
    }
    return d;
}
private static LinkedHashSet<Date> parseRawDate(String text) { private static LinkedHashSet<Date> parseRawDate(String text) {
// get parse alternatives for different date styles; we consider that one document uses only one style // get parse alternatives for different date styles; we consider that one document uses only one style
LinkedHashSet<Date> DMYDates = EndianStyle.DMY.parse(text); LinkedHashSet<Date> DMYDates = EndianStyle.DMY.parse(text);

@ -22,6 +22,7 @@ package net.yacy.search.query;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.MultiMapSolrParams; import org.apache.solr.common.params.MultiMapSolrParams;
@ -29,6 +30,7 @@ import org.apache.solr.common.params.MultiMapSolrParams;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.DateDetection;
import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.ISO639;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -37,7 +39,7 @@ import net.yacy.server.serverObjects;
public class QueryModifier { public class QueryModifier {
private final StringBuilder modifier; private final StringBuilder modifier;
public String sitehost, sitehash, filetype, protocol, language, author, collection; public String sitehost, sitehash, filetype, protocol, language, author, collection, on;
public QueryModifier() { public QueryModifier() {
this.sitehash = null; this.sitehash = null;
@ -47,6 +49,7 @@ public class QueryModifier {
this.language = null; this.language = null;
this.author = null; this.author = null;
this.collection = null; this.collection = null;
this.on = null;
this.modifier = new StringBuilder(20); this.modifier = new StringBuilder(20);
} }
@ -142,6 +145,18 @@ public class QueryModifier {
querystring = querystring.replace("collection:" + this.collection, ""); querystring = querystring.replace("collection:" + this.collection, "");
add("collection:" + this.collection); add("collection:" + this.collection);
} }
// parse on-date
final int oni = querystring.indexOf("on:", 0);
if ( oni >= 0 ) {
int ftb = querystring.indexOf(' ', oni);
if ( ftb == -1 ) {
ftb = querystring.length();
}
this.on = querystring.substring(oni + 3, ftb);
querystring = querystring.replace("on:" + this.on, "");
add("on:" + this.on);
}
// parse language // parse language
final int langi = querystring.indexOf("/language/"); final int langi = querystring.indexOf("/language/");
@ -240,6 +255,10 @@ public class QueryModifier {
fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.collection)); fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.collection));
} }
if (this.on != null && this.on.length() > 0 && fq.indexOf(CollectionSchema.dates_in_content_sxt.getSolrFieldName()) < 0) {
fq.append(" AND ").append(QueryModifier.parseOnExpression(this.on));
}
if (this.protocol != null && this.protocol.length() > 0 && fq.indexOf(CollectionSchema.url_protocol_s.getSolrFieldName()) < 0) { if (this.protocol != null && this.protocol.length() > 0 && fq.indexOf(CollectionSchema.url_protocol_s.getSolrFieldName()) < 0) {
fq.append(" AND ").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":\"").append(this.protocol).append('\"'); fq.append(" AND ").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(":\"").append(this.protocol).append('\"');
} }
@ -295,6 +314,15 @@ public class QueryModifier {
filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(":\"").append(sites.get(0)).append('\"'); filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(":\"").append(sites.get(0)).append('\"');
} }
return filterQuery.toString(); return filterQuery.toString();
}
/**
 * Builds the Solr filter clause for an "on:" date modifier.
 * <p>
 * The description is converted to a date with {@code DateDetection.parseLine}. If a
 * date is found, the result is the {@code dates_in_content_sxt} field name followed by
 * a colon and the quoted external date representation produced by
 * {@code TrieDateField.formatExternal}. If no date is found, the result is an empty string.
 * <p>
 * NOTE(review): a caller that appends the result directly after {@code " AND "} without
 * checking for an empty string ends up with a dangling operator when the description
 * cannot be parsed - confirm that callers handle this case.
 *
 * @param onDescription the date text given after the "on:" modifier
 * @return the filter clause, or an empty string if the description is not recognized as a date
 */
public static String parseOnExpression(String onDescription) {
Date onDate = DateDetection.parseLine(onDescription);
StringBuilder filterQuery = new StringBuilder(20);
if (onDate != null) {
filterQuery.append(CollectionSchema.dates_in_content_sxt.getSolrFieldName()).append(":\"").append(org.apache.solr.schema.TrieDateField.formatExternal(onDate)).append('\"');
}
return filterQuery.toString();
} }

@ -481,6 +481,10 @@ public final class QueryParams {
fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.modifier.collection)); fq.append(" AND ").append(QueryModifier.parseCollectionExpression(this.modifier.collection));
} }
if (this.modifier.on != null && this.modifier.on.length() > 0 && this.solrSchema.contains(CollectionSchema.dates_in_content_sxt)) {
fq.append(" AND ").append(QueryModifier.parseOnExpression(this.modifier.on));
}
if (this.modifier.protocol != null) { if (this.modifier.protocol != null) {
fq.append(" AND {!tag=").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append("}").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(':').append(this.modifier.protocol); fq.append(" AND {!tag=").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append("}").append(CollectionSchema.url_protocol_s.getSolrFieldName()).append(':').append(this.modifier.protocol);
} }

Loading…
Cancel
Save