added an option to exclude image search results from text search. This

is on by default.
pull/1/head
Michael Peter Christen 12 years ago
parent 69f85265e1
commit 049c3b3f2e

@ -112,7 +112,7 @@ public class searchresult {
// get a solr query string
QueryGoal qg = new QueryGoal(originalQuery, originalQuery);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), 0, false);
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());
post.put(CommonParams.ROWS, post.remove("num"));

@ -168,7 +168,7 @@ public class select {
querystring = modifier.parse(querystring);
modifier.apply(post);
QueryGoal qg = new QueryGoal(querystring, querystring);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr);
StringBuilder solrQ = qg.collectionTextQueryString(sb.index.fulltext().getDefaultConfiguration(), profileNr, false);
post.put(CommonParams.Q, solrQ.toString()); // sru patch
}
String q = post.get(CommonParams.Q, "");

@ -172,7 +172,7 @@ public class RemoteSearch extends Thread {
nodePeers.add(event.peers.mySeed());
}
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_REMOTE_SOLR_OFF, false)) {
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0);
final SolrQuery solrQuery = event.query.solrQuery(event.getQuery().contentdom, start == 0, event.excludeintext_image);
for (Seed s: nodePeers) {
Thread t = solrRemoteSearch(event, solrQuery, start, count, s, blacklist);
event.nodeSearchThreads.add(t);

@ -207,11 +207,12 @@ public class QueryGoal {
for (final byte[] b: blues) this.include_hashes.remove(b);
}
public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile) {
public StringBuilder collectionTextQueryString(CollectionConfiguration configuration, int rankingProfile, boolean noimages) {
final StringBuilder q = new StringBuilder(80);
// add filter to prevent that results come from failed urls
q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
if (noimages) q.append(" AND -").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif)");
// parse special requests
if (isCatchall()) return q;

@ -376,12 +376,12 @@ public final class QueryParams {
return SetTools.anymatch(wordhashes, keyhashes);
}
public SolrQuery solrQuery(ContentDomain cd, boolean getFacets) {
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
if (cd == ContentDomain.IMAGE) return solrImageQuery(getFacets);
return solrTextQuery(getFacets);
return solrTextQuery(getFacets, excludeintext_image);
}
private SolrQuery solrTextQuery(boolean getFacets) {
private SolrQuery solrTextQuery(final boolean getFacets, final boolean excludeintext_image) {
if (this.cachedQuery != null) {
this.cachedQuery.setStart(this.offset);
return this.cachedQuery;
@ -391,7 +391,7 @@ public final class QueryParams {
// construct query
final SolrQuery params = getBasicParams(getFacets);
int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0;
params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile).toString());
params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile, excludeintext_image).toString());
Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile
String bq = ranking.getBoostQuery();
@ -399,36 +399,6 @@ public final class QueryParams {
if (bq.length() > 0) params.setParam("bq", bq);
if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29
/*
if (this.contentdom == ContentDomain.IMAGE) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"jpg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tif\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"tiff\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"png\")");
}
if (this.contentdom == ContentDomain.AUDIO) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aif\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"aiff\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp3\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"ogg\")");
}
if (this.contentdom == ContentDomain.VIDEO) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mpg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"avi\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mp4\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"mkv\")");
}
if (this.contentdom == ContentDomain.APP) {
fq.append(" AND (").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"apk\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"exe\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"dmg\"");
fq.append(" OR ").append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":\"gz\")");
}
*/
// prepare result
ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
this.cachedQuery = params;

@ -164,7 +164,8 @@ public final class SearchEvent {
private final WeakPriorityBlockingQueue<URIMetadataNode> nodeStack; // thats the bag where the solr results are written to
private final WeakPriorityBlockingQueue<ResultEntry> resultList; // thats the result list where the actual search result is waiting to be displayed
private final boolean pollImmediately; // if this is true, then every entry in result List is polled immediately to prevent a re-ranking in the resultList. This is usefull if there is only one index source.
public final boolean excludeintext_image;
// the following values are filled during the search process as statistics for the search
public final AtomicInteger local_rwi_available; // the number of hits generated/ranked by the local search in rwi index
public final AtomicInteger local_rwi_stored; // the number of existing hits by the local search in rwi index
@ -220,6 +221,7 @@ public final class SearchEvent {
this.nodeStack = new WeakPriorityBlockingQueue<URIMetadataNode>(100, false);
this.maxExpectedRemoteReferences = new AtomicInteger(0);
this.expectedRemoteReferences = new AtomicInteger(0);
this.excludeintext_image = Switchboard.getSwitchboard().getConfigBool("search.excludeintext.image", true);
// prepare configured search navigation
final String navcfg = Switchboard.getSwitchboard().getConfig("search.navigation", "");
this.authorNavigator = navcfg.contains("authors") ? new ConcurrentScoreMap<String>() : null;
@ -282,7 +284,7 @@ public final class SearchEvent {
// start a local solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, true, this.excludeintext_image), 0, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset = this.query.itemsPerPage;
@ -837,6 +839,13 @@ public final class SearchEvent {
if (log.isFine()) log.fine("dropped Node: content domain does not match");
continue pollloop;
}
// filter out media links in text search, if wanted
String ext = MultiProtocolURI.getFileExtension(iEntry.url().getFileName());
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
if (log.isFine()) log.fine("dropped Node: file name domain does not match");
continue pollloop;
}
// check site constraints
final String hosthash = iEntry.hosthash();
@ -1014,7 +1023,7 @@ public final class SearchEvent {
}
// check content domain
if (((this.query.contentdom == Classification.ContentDomain.TEXT && page.url().getContentDomain() == Classification.ContentDomain.IMAGE) ||
if (this.query.contentdom.getCode() > 0 && (
(this.query.contentdom == Classification.ContentDomain.IMAGE && page.url().getContentDomain() != Classification.ContentDomain.IMAGE) ||
(this.query.contentdom == Classification.ContentDomain.AUDIO && page.url().getContentDomain() != Classification.ContentDomain.AUDIO) ||
(this.query.contentdom == Classification.ContentDomain.VIDEO && page.url().getContentDomain() != Classification.ContentDomain.VIDEO) ||
@ -1024,6 +1033,13 @@ public final class SearchEvent {
continue;
}
// filter out media links in text search, if wanted
String ext = MultiProtocolURI.getFileExtension(page.url().getFileName());
if (this.query.contentdom == ContentDomain.TEXT && Classification.isImageExtension(ext) && this.excludeintext_image) {
if (log.isFine()) log.fine("dropped RWI: file name domain does not match");
continue;
}
// Check for blacklist
if (Switchboard.urlBlacklist.isListed(BlacklistType.SEARCH, page)) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in url blacklist");
@ -1340,7 +1356,7 @@ public final class SearchEvent {
int nextitems = item - this.localsolroffset + this.query.itemsPerPage; // example: suddenly switch to item 60, just 10 had been shown, 20 loaded.
if (this.localsolrsearch != null && this.localsolrsearch.isAlive()) {try {this.localsolrsearch.join();} catch (final InterruptedException e) {}}
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, nextitems, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset += nextitems;
}
@ -1361,7 +1377,7 @@ public final class SearchEvent {
if (this.localsolrsearch == null || !this.localsolrsearch.isAlive() && this.local_solr_stored.get() > this.localsolroffset && (item + 1) % this.query.itemsPerPage == 0) {
// at the end of a list, trigger a next solr search
if (!Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.DEBUG_SEARCH_LOCAL_SOLR_OFF, false)) {
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
this.localsolrsearch = RemoteSearch.solrRemoteSearch(this, this.query.solrQuery(this.query.contentdom, this.localsolroffset == 0, this.excludeintext_image), this.localsolroffset, this.query.itemsPerPage, null /*this peer*/, Switchboard.urlBlacklist);
}
this.localsolroffset += this.query.itemsPerPage;
}

Loading…
Cancel
Save