From 049c3b3f2ee2f7ec9467751cf42328f7b0738eb7 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 3 Sep 2013 11:14:23 +0200 Subject: [PATCH] added an option to exclude image search results from text search. This is on by default. --- htroot/gsa/searchresult.java | 2 +- htroot/solr/select.java | 2 +- source/net/yacy/peers/RemoteSearch.java | 2 +- source/net/yacy/search/query/QueryGoal.java | 3 +- source/net/yacy/search/query/QueryParams.java | 38 ++----------------- source/net/yacy/search/query/SearchEvent.java | 26 ++++++++++--- 6 files changed, 30 insertions(+), 43 deletions(-) diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 4528ad6ad..ec7afb048 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -112,7 +112,7 @@ public class searchresult { // get a solr query string QueryGoal qg = new QueryGoal(originalQuery, originalQuery); - StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0); + StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0, false); post.put("defType", "edismax"); post.put(CommonParams.Q, solrQ.toString()); post.put(CommonParams.ROWS, post.remove("num")); diff --git a/htroot/solr/select.java b/htroot/solr/select.java index 311bec918..3d5a1e402 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -168,7 +168,7 @@ public class select { querystring = modifier.parse(querystring); modifier.apply(post); QueryGoal qg = new QueryGoal(querystring, querystring); - StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr); + StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr, false); post.put(CommonParams.Q, solrQ.toString()); // sru patch } String q = post.get(CommonParams.Q, ""); diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 600d63474..73e468d27 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -172,7 +172,7 @@ public class RemoteSearch extends Thread { nodePeers.add(event.peers.mySeed()); } if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) { - final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0); + final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0, event.excludeintext_image); for (Seed s: nodePeers) { Thread t = solrRemoteSearch(event, solrQuery, start, count, s, blacklist); event.nodeSearchThreads.add(t); diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index c564ba698..1d7bea888 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -207,11 +207,12 @@ public class QueryGoal { for (final byte[] b: blues) this.include_hashes.remove(b); } - public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile) { + public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile, boolean noimages) { final StringBuilder q = new StringBuilder(80); // add filter to prevent that results come from failed urls q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); + if (noimages) q.append(" AND -").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif)"); // parse special requests if (isCatchall()) return q; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index c59ade8d7..588a80ef7 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -376,12 +376,12 @@ public final class QueryParams { return SetTools.anymatch(wordhashes, keyhashes); } - public SolrQuery solrQuery(ContentDomain cd, boolean getFacets) { + public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) { if (cd == ContentDomain.IMAGE) return solrImageQuery(getFacets); - return solrTextQuery(getFacets); + return solrTextQuery(getFacets, excludeintext_image); } - private SolrQuery solrTextQuery(boolean getFacets) { + private SolrQuery solrTextQuery(final boolean getFacets, final boolean excludeintext_image) { if (this.cachedQuery != null) { this.cachedQuery.setStart(this.offset); return this.cachedQuery; @@ -391,7 +391,7 @@ public final class QueryParams { // construct query final SolrQuery params = getBasicParams(getFacets); int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0; - params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile).toString()); + params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile, excludeintext_image).toString()); Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile String bq = ranking.getBoostQuery(); @@ -399,36 +399,6 @@ public final class QueryParams { if (bq.length() > 0) params.setParam("bq", bq); if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29 - /* - if (this.contentdom == ContentDomain.IMAGE) { - fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\")"); - } - - if (this.contentdom == ContentDomain.AUDIO) { - fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aif\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aiff\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp3\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"ogg\")"); - } - - if (this.contentdom == ContentDomain.VIDEO) { - fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mpg\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"avi\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp4\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mkv\")"); - } - - if (this.contentdom == ContentDomain.APP) { - fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"apk\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"exe\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"dmg\""); - fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"gz\")"); - } - */ - // prepare result ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString()); this.cachedQuery = params; diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 6b2dd7736..4224ab8b7 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -164,7 +164,8 @@ public final class SearchEvent { private final WeakPriorityBlockingQueue nodeStack; // thats the bag where the solr results are written to private final WeakPriorityBlockingQueue resultList; // thats the result list where the actual search result is waiting to be displayed private final boolean pollImmediately; // if this is true, then every entry in result List is polled immediately to prevent a re-ranking in the resultList. This is usefull if there is only one index source. - + public final boolean excludeintext_image; + // the following values are filled during the search process as statistics for the search public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index public final AtomicInteger local_rwi_stored; // the number of existing hits by the local search in rwi index @@ -220,6 +221,7 @@ public final class SearchEvent { this.nodeStack = new WeakPriorityBlockingQueue(100, false); this.maxExpectedRemoteReferences = new AtomicInteger(0); this.expectedRemoteReferences = new AtomicInteger(0); + this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true); // prepare configured search navigation final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", ""); this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap() : null; @@ -282,7 +284,7 @@ public final class SearchEvent { // start a local solr search if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { - this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist); + this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true, this.excludeintext_image), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist); } this.localsolroffset = this.query.itemsPerPage; @@ -837,6 +839,13 @@ public final class SearchEvent { if (log.isFine()) log.fine("dropped Node: content domain does not match"); continue pollloop; } + + // filter out media links in text search, if wanted + String ext = MultiProtocolURI.getFileExtension(iEntry.url().getFileName()); + if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { + if (log.isFine()) log.fine("dropped Node: file name domain does not match"); + continue pollloop; + } // check site constraints final String hosthash = iEntry.hosthash(); @@ -1014,7 +1023,7 @@ public final class SearchEvent { } // check content domain - if (((this.query.contentdom == Classification.ContentDomain.TEXT && page.url().getContentDomain() == Classification.ContentDomain.IMAGE) || + if (this.query.contentdom.getCode() > 0 && ( (this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) || (this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) || (this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) || @@ -1024,6 +1033,13 @@ public final class SearchEvent { continue; } + // filter out media links in text search, if wanted + String ext = MultiProtocolURI.getFileExtension(page.url().getFileName()); + if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) { + if (log.isFine()) log.fine("dropped RWI: file name domain does not match"); + continue; + } + // Check for blacklist if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page)) { if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist"); @@ -1340,7 +1356,7 @@ public final class SearchEvent { int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded. if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}} if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { - this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist); + this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist); } this.localsolroffset += nextitems; } @@ -1361,7 +1377,7 @@ public final class SearchEvent { if (this.localsolrsearch == null || !this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0) { // at the end of a list, trigger a next solr search if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) { - this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist); + this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist); } this.localsolroffset += this.query.itemsPerPage; }