From e6907fdab37bb05d9766b2ea67a9d25f45e7f42c Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 23 Dec 2017 18:56:17 +0100 Subject: [PATCH] Added optional search parameter/setting to control content domain filter Thus allowing to choose at configuration or per search request, whether extending or not results beyond strict content domain filter (image, video, audio or application). Related graphical controls to be added to user interface. --- defaults/yacy.init | 11 +++ htroot/yacy/search.java | 3 + htroot/yacysearch.java | 6 ++ htroot/yacysearchitem.java | 2 +- source/net/yacy/peers/Protocol.java | 8 +++ source/net/yacy/peers/RemoteSearch.java | 9 ++- .../net/yacy/search/SwitchboardConstants.java | 10 +++ source/net/yacy/search/query/QueryGoal.java | 59 ++++++++++------ source/net/yacy/search/query/QueryParams.java | 69 +++++++++++++++---- source/net/yacy/search/query/SearchEvent.java | 59 +++++++++++----- 10 files changed, 182 insertions(+), 54 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 1d8c67449..a13b1a055 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -843,6 +843,17 @@ search.audio = false search.video = false search.app = false +# Strict content domain filtering : when false, results can be extended to documents including links to documents +# of contentdom type, without being themselves of that type. 
+# Examples : +# - contentdom search param == image, strictContentDom == true +# - jpeg image : acceptable result +# - html page embedding images : rejected +# - contentdom search param == image, strictContentDom == false +# - jpeg image : acceptable result +# - html page embedding images : acceptable result +search.strictContentDom = false + # number of search results per page displayed by default search.items = 10 diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 46bfef3a4..ed810d475 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -116,6 +116,7 @@ public final class search { final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); final String contentdom = post.get("contentdom", "all"); + final boolean strictContentDom = post.getBoolean("strictContentDom"); final String filter = post.get("filter", ".*"); // a filter on the url final int timezoneOffset = post.getInt("timezoneOffset", 0); QueryModifier modifier = new QueryModifier(timezoneOffset); @@ -255,6 +256,7 @@ public final class search { 0.0d, new String[0] ); + theQuery.setStrictContentDom(strictContentDom); Network.log.info("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links"); final long timer = System.currentTimeMillis(); @@ -319,6 +321,7 @@ public final class search { 0.0d, new String[0] ); + theQuery.setStrictContentDom(strictContentDom); Network.log.info("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links"); EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? 
"unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), "")); if (sb.getConfigBool(SwitchboardConstants.DECORATION_AUDIO, false)) Audio.Soundclip.remotesearch.play(-10.0f); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 37415d878..df72f3fb7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -290,6 +290,11 @@ public class yacysearch { // find search domain final Classification.ContentDomain contentdom = post == null || !post.containsKey("contentdom") ? ContentDomain.ALL : ContentDomain.contentdomParser(post.get("contentdom", "all")); + + // Strict/extended content domain constraint : configured setting may be overriden by request param + final boolean strictContentDom = !Boolean.FALSE.toString().equalsIgnoreCase(post.get("strictContentDom", + sb.getConfig(SwitchboardConstants.SEARCH_STRICT_CONTENT_DOM, + String.valueOf(SwitchboardConstants.SEARCH_STRICT_CONTENT_DOM_DEFAULT)))); // check the search tracker TreeSet trackerHandles = sb.localSearchTracker.get(client); @@ -692,6 +697,7 @@ public class yacysearch { header.get(HeaderFramework.USER_AGENT, ""), lat, lon, rad, sb.getConfigArray("search.navigation", "")); + theQuery.setStrictContentDom(strictContentDom); theQuery.setStandardFacetsMaxCount(sb.getConfigInt(SwitchboardConstants.SEARCH_NAVIGATION_MAXCOUNT, QueryParams.FACETS_STANDARD_MAXCOUNT_DEFAULT)); theQuery.setDateFacetMaxCount(sb.getConfigInt(SwitchboardConstants.SEARCH_NAVIGATION_DATES_MAXCOUNT, diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index d88d14751..62922428b 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -605,7 +605,7 @@ public class yacysearchitem { final SearchEvent theSearch, final String target_special_pattern, long timeout, boolean fullViewingRights, final boolean noreferrer) { prop.put("content", theSearch.query.contentdom.getCode() + 1); // switch on specific content try { - 
SearchEvent.ImageResult image = theSearch.oneImageResult(item, timeout); + SearchEvent.ImageResult image = theSearch.oneImageResult(item, timeout, theSearch.query.isStrictContentDom()); final String imageUrlstring = image.imageUrl.toNormalform(true); final String imageUrlExt = MultiProtocolURL.getFileExtension(image.imageUrl.getFileName()); final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 92bbd9e96..4ec2677e3 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -488,6 +488,7 @@ public final class Protocol { final String excludehashes, final String language, final ContentDomain contentdom, + final boolean strictContentDom, final int count, final long time, final int maxDistance, @@ -533,6 +534,7 @@ public final class Protocol { "", language, contentdom, + strictContentDom, count, time, maxDistance, @@ -600,6 +602,7 @@ public final class Protocol { final String wordhashes, final String urlhashes, final ContentDomain contentdom, + final boolean strictContentDom, final int count, final long time, final int maxDistance, @@ -624,6 +627,7 @@ public final class Protocol { urlhashes, "", contentdom, + strictContentDom, count, time, maxDistance, @@ -889,6 +893,7 @@ public final class Protocol { final String urlhashes, final String language, final ContentDomain contentdom, + final boolean strictContentDom, final int count, final long time, final int maxDistance, @@ -941,6 +946,9 @@ public final class Protocol { //parts.put("sitehost", UTF8.StringBody(event.query.modifier.sitehost)); parts.put("author", UTF8.StringBody(event.query.modifier.author)); parts.put("contentdom", UTF8.StringBody(contentdom == null ? 
ContentDomain.ALL.toString() : contentdom.toString())); + if(strictContentDom) { + parts.put("strictContentDom", UTF8.StringBody("true")); + } parts.put("maxdist", UTF8.StringBody(Integer.toString(maxDistance))); parts.put("profile", UTF8.StringBody(crypt.simpleEncode(event.query.ranking.toExternalString()))); parts.put("constraint", UTF8.StringBody((event.query.constraint == null) ? "" : event.query.constraint.exportB64())); diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 830d8ab30..36dd69150 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -61,6 +61,7 @@ public class RemoteSearch extends Thread { final private SearchEvent event; final private String wordhashes, excludehashes; final private ContentDomain contentdom; + final private boolean strictContentDom; final private int partitions; final private SecondarySearchSuperviser secondarySearchSuperviser; final private Blacklist blacklist; @@ -78,6 +79,7 @@ public class RemoteSearch extends Thread { final String excludehashes, final String language, final ContentDomain contentdom, + final boolean strictContentDom, final int count, final long time, final int maxDistance, @@ -91,6 +93,7 @@ public class RemoteSearch extends Thread { this.excludehashes = excludehashes; this.language = language; this.contentdom = contentdom; + this.strictContentDom = strictContentDom; this.partitions = partitions; this.secondarySearchSuperviser = secondarySearchSuperviser; this.blacklist = blacklist; @@ -114,6 +117,7 @@ public class RemoteSearch extends Thread { this.excludehashes, this.language, this.contentdom, + this.strictContentDom, this.count, this.time, this.maxDistance, @@ -264,7 +268,8 @@ public class RemoteSearch extends Thread { // start solr searches final int targets = dhtPeers.size() + robinsonPeers.size(); if (!sb.getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) { - final SolrQuery solrQuery = 
event.query.solrQuery(event.getQuery().contentdom, useFacets, event.excludeintext_image); + final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, + event.query.isStrictContentDom(), useFacets, event.excludeintext_image); for (Seed s: robinsonPeers) { if (MemoryControl.shortStatus() || Memory.load() > sb.getConfigFloat(SwitchboardConstants.REMOTESEARCH_MAXLOAD_SOLR, @@ -292,6 +297,7 @@ public class RemoteSearch extends Thread { QueryParams.hashSet2hashString(event.query.getQueryGoal().getExcludeHashes()), event.query.targetlang == null ? "" : event.query.targetlang, event.query.contentdom == null ? ContentDomain.ALL : event.query.contentdom, + event.query.isStrictContentDom(), count, time, event.query.maxDistance, @@ -336,6 +342,7 @@ public class RemoteSearch extends Thread { QueryParams.hashSet2hashString(wordhashes), urlhashes, ContentDomain.ALL, + false, 20, time, 999, diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 71b91cab9..38e86e435 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -560,6 +560,16 @@ public final class SwitchboardConstants { public static final String SEARCH_VERIFY = "search.verify"; public static final String SEARCH_VERIFY_DELETE = "search.verify.delete"; + /** + * Key of the setting controlling whether content domain filtering is strict : + * when false, results can be extended to documents including links to documents + * of contentdom type, without being themselves of that type. + */ + public static final String SEARCH_STRICT_CONTENT_DOM = "search.strictContentDom"; + + /** Default setting value controlling whether content domain filtering is strict. 
*/ + public static final boolean SEARCH_STRICT_CONTENT_DOM_DEFAULT = false; + /** Key of the setting controlling whether search results resorting by browser JavaScript is enabled */ public static final String SEARCH_JS_RESORT = "search.jsresort"; diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index b1ec4a5ec..9c94540fa 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -368,41 +368,49 @@ public class QueryGoal { /** * Generate a Solr filter query to receive valid image results. * - * This filters error-urls out and includes urls with mime image/* as well - * as urls with links to images. + * This filters error-urls out and includes urls with mime image/*, as well + * as urls with links to images when strict is false. * We use the mime (image/*) only to find images as the parser assigned the * best mime to index documents. This applies also to parsed file systems. * This ensures that no text urls with image-fileextension is returned * (as some large internet sites like to use such urls) * + * @param strict when true, do not include non-image urls with links to images * @return Solr filter query for image urls */ - public List collectionImageFilterQuery() { + public List collectionImageFilterQuery(final boolean strict) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK); - fqs.add( - CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " + - CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); + StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*)"); + if (!strict) { + filter.append(" OR ").append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()) + .append(AbstractSolrConnector.CATCHALL_DTERM); + } + 
fqs.add(filter.toString()); return fqs; } /** - * Generate Solr filter queries to receive valid video content results. + * Generate Solr filter queries to receive valid audio content results. * - * This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix video/* as well - * docuemnts with links to video content. + * This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix audio/* as well + * documents with links to audio content when strict is false. * - * @return Solr filter queries for video content URLs + * @param strict when true, do not include non-audio urls with links to audio + * @return Solr filter queries for audio content URLs */ - public List collectionAudioFilterQuery() { + public List collectionAudioFilterQuery(final boolean strict) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK); - fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(audio/*) OR " - + CollectionSchema.audiolinkscount_i.getSolrFieldName() + ":[1 TO *]"); + StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(audio/*)"); + if (!strict) { + filter.append(" OR ").append(CollectionSchema.audiolinkscount_i.getSolrFieldName()).append(":[1 TO *]"); + } + fqs.add(filter.toString()); return fqs; } @@ -410,17 +418,21 @@ public class QueryGoal { * Generate Solr filter queries to receive valid video content results. * * This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix video/* as well - * docuemnts with links to video content. + * documents with links to video content when strict is false. 
* + * @param strict when true, do not include non-video urls with links to video * @return Solr filter queries for video content URLs */ - public List collectionVideoFilterQuery() { + public List collectionVideoFilterQuery(final boolean strict) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK); - fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(video/*) OR " - + CollectionSchema.videolinkscount_i.getSolrFieldName() + ":[1 TO *]"); + StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()).append(":(video/*)"); + if (!strict) { + filter.append(" OR ").append(CollectionSchema.videolinkscount_i.getSolrFieldName()).append(":[1 TO *]"); + } + fqs.add(filter.toString()); return fqs; } @@ -428,17 +440,22 @@ public class QueryGoal { * Generate Solr filter queries to receive valid application specific content results. * * This filters out documents with bad HTTP status and includes documents with MIME type matching the prefix application/* as well - * docuemnts with links to application specific content. + * documents with links to application specific content when strict is false. 
* + * @param strict when true, do not include non-application urls with links to application specific content * @return Solr filter queries for application specific content URLs */ - public List collectionApplicationFilterQuery() { + public List collectionApplicationFilterQuery(final boolean strict) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":" + HttpStatus.SC_OK); - fqs.add(CollectionSchema.content_type.getSolrFieldName() + ":(application/*) OR " - + CollectionSchema.applinkscount_i.getSolrFieldName() + ":[1 TO *]"); + StringBuilder filter = new StringBuilder(CollectionSchema.content_type.getSolrFieldName()) + .append(":(application/*)"); + if (!strict) { + filter.append(" OR ").append(CollectionSchema.applinkscount_i.getSolrFieldName()).append(":[1 TO *]"); + } + fqs.add(filter.toString()); return fqs; } diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 6a6572096..c757e2bff 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -132,7 +132,31 @@ public final class QueryParams { /** true when the urlMasString is just a catch all pattern such as ".*" */ boolean urlMask_isCatchall; + + /** Content-Type classification of expected results */ public final Classification.ContentDomain contentdom; + + /** + *

When false, results can be extended to documents including links to documents + * of {@link #contentdom} type, without being themselves of that type.

+ * Examples : + *
    + *
  • contentdom == IMAGE, strictContentDom == true + *
      + *
    • jpeg image : acceptable result
    • + *
    • html page embedding images : rejected
    • + *
    + *
  • + *
  • contentdom == IMAGE, strictContentDom == false + *
      + *
    • jpeg image : acceptable result
    • + *
    • html page embedding images : acceptable result
    • + *
    + *
  • + *
+ */ + private boolean strictContentDom = false; + public final String targetlang; protected final Collection metatags; public final Searchdom domType; @@ -380,6 +404,20 @@ public final class QueryParams { public void setDateFacetMaxCount(final int dateFacetMaxCount) { this.dateFacetMaxCount = dateFacetMaxCount; } + + /** + * @return false when results can be extended to documents including links to documents of contentdom type. + */ + public boolean isStrictContentDom() { + return this.strictContentDom; + } + + /** + * @param strictContentDom when false, results can be extended to documents including links to documents of contentdom type. + */ + public void setStrictContentDom(final boolean strictContentDom) { + this.strictContentDom = strictContentDom; + } public static HandleSet hashes2Set(final String query) { final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); @@ -513,20 +551,20 @@ public final class QueryParams { return SetTools.anymatchByTest(keywords, textwords); } - public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) { + public SolrQuery solrQuery(final ContentDomain cd, final boolean strictContentDom, final boolean getFacets, final boolean excludeintext_image) { if (cd == ContentDomain.IMAGE) { - return solrImageQuery(getFacets); + return solrImageQuery(getFacets, strictContentDom); } final List filterQueries; switch (cd) { case AUDIO: - filterQueries = this.queryGoal.collectionAudioFilterQuery(); + filterQueries = this.queryGoal.collectionAudioFilterQuery(strictContentDom); break; case VIDEO: - filterQueries = this.queryGoal.collectionVideoFilterQuery(); + filterQueries = this.queryGoal.collectionVideoFilterQuery(strictContentDom); break; case APP: - filterQueries = this.queryGoal.collectionApplicationFilterQuery(); + filterQueries = this.queryGoal.collectionApplicationFilterQuery(strictContentDom); break; 
default: filterQueries = this.queryGoal.collectionTextFilterQuery(excludeintext_image); @@ -579,7 +617,7 @@ public final class QueryParams { return params; } - private SolrQuery solrImageQuery(boolean getFacets) { + private SolrQuery solrImageQuery(final boolean getFacets, final boolean strictContentDom) { if (this.cachedQuery != null) { this.cachedQuery.setStart(this.offset); if (!getFacets) this.cachedQuery.setFacet(false); @@ -587,16 +625,18 @@ public final class QueryParams { } // construct query - final SolrQuery params = getBasicParams(getFacets, this.queryGoal.collectionImageFilterQuery()); + final SolrQuery params = getBasicParams(getFacets, this.queryGoal.collectionImageFilterQuery(strictContentDom)); params.setQuery(this.queryGoal.collectionImageQuery(this.modifier).toString()); - // set boosts - StringBuilder bq = new StringBuilder(); - bq.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\""); - bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\""); - bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\""); - bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\""); - params.setParam(DisMaxParams.BQ, bq.toString()); + if(!strictContentDom) { + // set boosts + StringBuilder bq = new StringBuilder(); + bq.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\""); + bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\""); + bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\""); + bq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\""); + params.setParam(DisMaxParams.BQ, bq.toString()); + } // prepare result ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString()); @@ -810,6 +850,7 @@ public final class QueryParams { //context.append(this.domType); 
context.append(asterisk); context.append(this.contentdom).append(asterisk); + context.append(this.strictContentDom).append(asterisk); context.append(this.zonecode).append(asterisk); context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk); context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 7fd642956..a46b5b290 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -428,7 +428,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { final boolean useSolrFacets = true; this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, - this.query.solrQuery(this.query.contentdom, useSolrFacets, this.excludeintext_image), this.query.offset, + this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), this.query.offset, this.query.itemsPerPage, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, true); } this.localsolroffset = this.query.offset + this.query.itemsPerPage; @@ -734,13 +734,27 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // check document domain - if (this.query.contentdom.getCode() > 0 && - ((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || - (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || - (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || - (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp))))) { - if (log.isFine()) log.fine("dropped RWI: contentdom fail"); - continue pollloop; + if (this.query.contentdom.getCode() > 0) 
{ + boolean domainMatch = true; + if(this.query.isStrictContentDom()) { + if((this.query.contentdom == ContentDomain.AUDIO && iEntry.getType() != Response.DT_AUDIO) || + (this.query.contentdom == ContentDomain.VIDEO && iEntry.getType() != Response.DT_MOVIE) || + (this.query.contentdom == ContentDomain.IMAGE && iEntry.getType() != Response.DT_IMAGE) || + (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { + domainMatch = false; + } + } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || + (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || + (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || + (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { + domainMatch = false; + } + if(!domainMatch) { + if (log.isFine()) { + log.fine("dropped RWI: contentdom fail"); + } + continue pollloop; + } } // check language @@ -1003,14 +1017,25 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // check document domain - if (this.query.contentdom.getCode() > 0 && - ((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || + if (this.query.contentdom.getCode() > 0) { + boolean domainMatch = true; + if(this.query.isStrictContentDom()) { + if(this.query.contentdom != iEntry.getContentDomain()) { + domainMatch = false; + } + } else if((this.query.contentdom == ContentDomain.AUDIO && !(flags.get(Tokenizer.flag_cat_hasaudio))) || (this.query.contentdom == ContentDomain.VIDEO && !(flags.get(Tokenizer.flag_cat_hasvideo))) || (this.query.contentdom == ContentDomain.IMAGE && !(flags.get(Tokenizer.flag_cat_hasimage))) || - (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp))))) { - if (log.isFine()) log.fine("dropped Node: content domain does not match"); - 
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); - continue pollloop; + (this.query.contentdom == ContentDomain.APP && !(flags.get(Tokenizer.flag_cat_hasapp)))) { + domainMatch = false; + } + if(!domainMatch) { + if (log.isFine()) { + log.fine("dropped Node: content domain does not match"); + } + updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); + continue pollloop; + } } // filter out media links in text search, if wanted @@ -2113,7 +2138,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { final boolean useSolrFacets = (this.localsolrsearch == null); final boolean incrementNavigators = false; this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, - this.query.solrQuery(this.query.contentdom, useSolrFacets, this.excludeintext_image), + this.query.solrQuery(this.query.contentdom, this.query.isStrictContentDom(), useSolrFacets, this.excludeintext_image), this.localsolroffset, nextitems, null /* this peer */, 0, Switchboard.urlBlacklist, useSolrFacets, incrementNavigators); } this.localsolroffset += nextitems; @@ -2204,7 +2229,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { return null; } - public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException { + public ImageResult oneImageResult(final int item, final long timeout, final boolean strictContentDom) throws MalformedURLException { if (item < imageViewed.size()) return nthImage(item); if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare URIMetadataNode doc = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare @@ -2233,7 +2258,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener { if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(doc.url(), doc.url(), doc.mime(), 
doc.title(), w, h, 0)); } } - } else { + } else if(!strictContentDom) { Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); if (imgO != null && imgO.size() > 0 && imgO instanceof List) {