Optimize internal imagequery: focus on using content_type to select images

(instead of the url file extension)
10 years ago · a58ee49307
parent 81f53fc83a
commit a58ee49307
2 changed files with 33 additions and 10 deletions
--- a/source/net/yacy/search/query/QueryGoal.java
+++ b/source/net/yacy/search/query/QueryGoal.java
@ -330,16 +330,29 @@ public class QueryGoal {
        for (final byte[] b: blues) this.include_hashes.remove(b);
    }

+    /**
+     * Generate the Solr filter queries used to receive only valid urls.
+     *
+     * This filters out error-urls.
+     * If noimages is true, filters are added to exclude links to images
+     * by content_type (as well as urls with a common image file extension).
+     *
+     * @param noimages  true if filters that exclude images should be added
+     * @return list of Solr filter queries
+     */
    public List<String> collectionTextFilterQuery(boolean noimages) {
        final ArrayList<String> fqs = new ArrayList<>();

        // add filter to prevent that results come from failed urls
        fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
-        if (noimages) fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)");
+        if (noimages) {
+            fqs.add("-" + CollectionSchema.content_type.getSolrFieldName() + ":(image/*)");
+            fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)");
+        }
        
        return fqs;
    }
-    
+
    public StringBuilder collectionTextQuery() {

        // parse special requests
@ -348,16 +361,27 @@ public class QueryGoal {
        // add goal query
        return getGoalQuery();
    }
-    
+
+    /**
+     * Generate the Solr filter queries used to receive valid image results.
+     *
+     * This filters out error-urls and includes urls with mime image/* as well
+     * as urls with links to images.
+     * To find images we rely only on the mime (image/*), because the parser
+     * assigns the best mime to indexed documents. This also applies to parsed
+     * file systems. It ensures that no text urls with an image file extension
+     * are returned (some large internet sites like to use such urls).
+     *
+     * @return list of Solr filter queries for image urls
+     */
    public List<String> collectionImageFilterQuery() {
        final ArrayList<String> fqs = new ArrayList<>();

        // add filter to prevent that results come from failed urls
        fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
        fqs.add(
-                CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM + " OR " +
-                CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif) OR " +
-                CollectionSchema.content_type.getSolrFieldName() + ":(image/*)");
+                CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " +
+                CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
        return fqs;
    }
    
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -419,7 +419,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        final DigestURL digestURL = document.dc_source();
        boolean allAttr = this.isEmpty();
        String url = addURIAttributes(doc, allAttr, digestURL);
-        if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, new String[]{document.dc_format()});
+        add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined as a schema field and we rely on it in some queries like imagequery (this makes it mandatory, no need to check)

        Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
        String host = digestURL.getHost();
@ -2028,9 +2028,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            
            final SolrInputDocument doc = new SolrInputDocument();
            String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL());
-            
-            if (allAttr || configuration.contains(CollectionSchema.content_type)) configuration.add(doc, CollectionSchema.content_type, new String[]{Classification.url2mime(this.digestURL)});
-
+            // content_type (mime) is defined as a schema field and we rely on it in some queries like imagequery (this makes it mandatory, no need to check)
+            CollectionSchema.content_type.add(doc, new String[]{Classification.url2mime(this.digestURL)});
            if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, getFailDate());
            if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth);