diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 70b551c4c..5f38ad4b1 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -330,16 +330,29 @@ public class QueryGoal { for (final byte[] b: blues) this.include_hashes.remove(b); } + /** + * Generate a Solr filter query to receive valid urls + * + * This filters out error-urls. + * On noimages=true a filter is added to exclude links to images + * using the content_type (as well as urls with common image file extension) + * + * @param noimages true if filter for images should be included + * @return Solr filter query + */ public List collectionTextFilterQuery(boolean noimages) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200"); - if (noimages) fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)"); + if (noimages) { + fqs.add("-" + CollectionSchema.content_type.getSolrFieldName() + ":(image/*)"); + fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)"); + } return fqs; } - + public StringBuilder collectionTextQuery() { // parse special requests @@ -348,16 +361,27 @@ public class QueryGoal { // add goal query return getGoalQuery(); } - + + /** + * Generate a Solr filter query to receive valid image results. + * + * This filters error-urls out and includes urls with mime image/* as well + * as urls with links to images. + * We use the mime (image/*) only to find images as the parser assigned the + * best mime to index documents. This applies also to parsed file systems. 
+ * This ensures that no text urls with an image file extension are returned + * (as some large internet sites like to use such urls) + * + * @return Solr filter query for image urls + */ public List collectionImageFilterQuery() { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200"); fqs.add( - CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM + " OR " + - CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif) OR " + - CollectionSchema.content_type.getSolrFieldName() + ":(image/*)"); + CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " + + CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); return fqs; } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index f98d49c78..047aee7a0 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -419,7 +419,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final DigestURL digestURL = document.dc_source(); boolean allAttr = this.isEmpty(); String url = addURIAttributes(doc, allAttr, digestURL); - if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); + add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined as a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to check) Set processTypes = new LinkedHashSet(); String host = digestURL.getHost(); @@ -2028,9 +2028,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final SolrInputDocument doc = new SolrInputDocument(); String url =
configuration.addURIAttributes(doc, allAttr, this.getDigestURL()); - - if (allAttr || configuration.contains(CollectionSchema.content_type)) configuration.add(doc, CollectionSchema.content_type, new String[]{Classification.url2mime(this.digestURL)}); - + // content_type (mime) is defined as a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to check) + CollectionSchema.content_type.add(doc, new String[]{Classification.url2mime(this.digestURL)}); if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, getFailDate()); if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth);