From c9d80b5b770935bfb8aba2c8eaf39e785883e43e Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 1 Dec 2017 11:52:52 +0100 Subject: [PATCH] Prefer fine URL match over approximate URL mask regex on final filtering Also prevent adding a redundant and CPU costly Solr url mask filter query when possible --- source/net/yacy/search/query/QueryParams.java | 17 ++++++++++++++--- source/net/yacy/search/query/SearchEvent.java | 8 +++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 3d6567791..6a6572096 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -120,12 +120,17 @@ public final class QueryParams { private final QueryGoal queryGoal; public int itemsPerPage; public int offset; + + /** The URL mask pattern compiled from the urlMaskString. + * Null when the urlMaskString is not user provided but generated from the query modifiers */ public Pattern urlMaskPattern; public Automaton urlMaskAutomaton; public String urlMaskString; public final Pattern prefer; public final String tld, inlink; + + /** true when the urlMaskString is just a catch all pattern such as ".*" */ boolean urlMask_isCatchall; public final Classification.ContentDomain contentdom; public final String targetlang; @@ -224,7 +229,9 @@ public final class QueryParams { this.urlMaskString = filter; this.urlMaskAutomaton = Automata.makeString(filter); this.urlMask_isCatchall = false; - this.urlMaskPattern = Pattern.compile(filter); + /* We leave the urlMaskPattern null here: + * final URL match checking will be made with the more accurate matchesURL function */ + this.urlMaskPattern = null; } } this.tld = tld; @@ -427,6 +434,10 @@ public final class QueryParams { } /** + * Check whether the given URL matches the eventual modifier and top-level domain + * constraints. 
Should be preferred as more accurate than the url mask pattern generated with + * {@link #buildApproximateURLFilter(QueryModifier, String)}. + * * @param modifier * the query modifier with eventual constraints on protocoln, host * name or file extension @@ -727,8 +738,8 @@ public final class QueryParams { fqs.add(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() + ":\"" + this.inlink + '\"'); } - if (!this.urlMask_isCatchall) { - // add a filter query on urls + if (!this.urlMask_isCatchall && this.urlMaskPattern != null) { + // add a filter query on urls only if user custom and not generated from other modifiers fqs.add(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/"); } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 167bf9ece..7fd642956 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -973,8 +973,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener { continue pollloop; } - if ( !this.query.urlMask_isCatchall ) { - // check url mask + if ( !this.query.urlMask_isCatchall && this.query.urlMaskPattern != null) { + // check url mask, only when not redundant with query modifier and tld constraints if (!iEntry.matches(this.query.urlMaskPattern)) { if (log.isFine()) log.fine("dropped Node: url mask does not match"); updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); @@ -1407,7 +1407,9 @@ public final class SearchEvent implements ScoreMapUpdatesListener { continue; } - if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) { + if (!this.query.urlMask_isCatchall && this.query.urlMaskPattern != null + && !page.matches(this.query.urlMaskPattern)) { + // check url mask, only when not redundant with query modifier and tld constraints if (log.isFine()) log.fine("dropped RWI: no match with urlMask"); decrementCounts(page.word()); continue;