Prefer fine URL match over approximate URL mask regex on final filtering

Also prevent adding a redundant and CPU costly Solr url mask filter
query when possible
pull/149/head
luccioman 7 years ago
parent 0a120787e3
commit c9d80b5b77

@ -120,12 +120,17 @@ public final class QueryParams {
private final QueryGoal queryGoal;
public int itemsPerPage;
public int offset;
/** The URL mask pattern compiled from the urlMaskString.
* Null when the urlMaskString is not user provided but generated from the query modifiers */
public Pattern urlMaskPattern;
public Automaton urlMaskAutomaton;
public String urlMaskString;
public final Pattern prefer;
public final String tld, inlink;
/** true when the urlMaskString is just a catch-all pattern such as ".*" */
boolean urlMask_isCatchall;
public final Classification.ContentDomain contentdom;
public final String targetlang;
@ -224,7 +229,9 @@ public final class QueryParams {
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
this.urlMask_isCatchall = false;
this.urlMaskPattern = Pattern.compile(filter);
/* We leave the urlMaskPattern null here :
* final URL match checking will be done with the more accurate matchesURL function */
this.urlMaskPattern = null;
}
}
this.tld = tld;
@ -427,6 +434,10 @@ public final class QueryParams {
}
/**
* Check whether the given URL matches the modifier and top-level domain
* constraints, if any. Should be preferred as more accurate than the url mask pattern generated with
* {@link #buildApproximateURLFilter(QueryModifier, String)}.
*
* @param modifier
* the query modifier with optional constraints on protocol, host
* name or file extension
@ -727,8 +738,8 @@ public final class QueryParams {
fqs.add(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() + ":\"" + this.inlink + '\"');
}
if (!this.urlMask_isCatchall) {
// add a filter query on urls
if (!this.urlMask_isCatchall && this.urlMaskPattern != null) {
// add a filter query on urls only if the mask is user-provided and not generated from other modifiers
fqs.add(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
}

@ -973,8 +973,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
continue pollloop;
}
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if ( !this.query.urlMask_isCatchall && this.query.urlMaskPattern != null) {
// check url mask, only when not redundant with query modifier and tld constraints
if (!iEntry.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped Node: url mask does not match");
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
@ -1407,7 +1407,9 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
continue;
}
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (!this.query.urlMask_isCatchall && this.query.urlMaskPattern != null
&& !page.matches(this.query.urlMaskPattern)) {
// check url mask, only when not redundant with query modifier and tld constraints
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
decrementCounts(page.word());
continue;

Loading…
Cancel
Save