From 19f1308bf09172d2be66c58289d52ba2b2c0cf9d Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Sun, 30 Aug 2015 02:19:52 +0200
Subject: [PATCH 1/3] enforce the result images limit to > 16x16px for linked
 images http://mantis.tokeek.de/view.php?id=594

---
 source/net/yacy/search/query/SearchEvent.java | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index cdcec498d..49ee55380 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -1598,7 +1598,7 @@ public final class SearchEvent {
             Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
             Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
             if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
-                List<Object> alt = altO == null ? new ArrayList<Object>(imgO.size()) : (List<Object>) altO;
+                List<Object> alt = altO == null ? null : (List<Object>) altO;
                 List<Object> img = (List<Object>) imgO;
                 List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
                 Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
@@ -1608,17 +1608,21 @@ public final class SearchEvent {
                 for (int c = 0; c < img.size(); c++) {
                     String image_urlstub =  (String) img.get(c);
                     if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic
-                    String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
-                    boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
                     try {
+                        int h = height == null ? 0 : (Integer) height.get(c);
+                        int w = width == null ? 0 : (Integer) width.get(c);
+
+                        // check size good for display (parser may init unknown dimension with -1)
+                        if (h > 0 && h <= 16) continue; // too small for display
+                        if (w > 0 && w <= 16) continue; // too small for display
+                        
                         DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
-                        Integer h = height == null ? null : (Integer) height.get(c);
-                        Integer w = width == null ? null : (Integer) width.get(c);
-                        boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
                         String id = ASCII.String(imageUrl.hash());
                         if (!imageViewed.containsKey(id) && !containsSpare(id)) {
-                            ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
-                            if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
+                            String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : "";
+                            ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0);
+                            boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
+                            if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
                         }
                     } catch (MalformedURLException e) {
                         continue;

From c33229fc0c2b2dc5325137171a10c9521d5ac944 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Sun, 30 Aug 2015 23:02:19 +0200
Subject: [PATCH 2/3] check mime prior to ext for metadata modification for
 images

---
 .../net/yacy/cora/document/id/MultiProtocolURL.java   |  4 ++++
 .../yacy/search/schema/CollectionConfiguration.java   | 11 ++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index 6df62a416..7cfc1543f 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -1178,6 +1178,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
         return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0;
     }
 
+    /**
+     * @deprecated use a method that considers the mime type instead (e.g. Document.getContentDomain() == ContentDomain.IMAGE)
+     */
+    @Deprecated
     public static final boolean isImage(final String extension) {
         return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
     }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index e3f46b4c0..c9fb596e9 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.AnchorURL;
@@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         LinkedHashMap<DigestURL,String> outboundLinks = document.outboundLinks();
 
         Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
-        List<ImageEntry> images = new ArrayList<ImageEntry>();
         int c = 0;
         final Object parser = document.getParserObject();
         boolean containsCanonical = false;
         DigestURL canonical = null;
         if (parser instanceof ContentScraper) {
             final ContentScraper html = (ContentScraper) parser;
-            images = html.getImages();
+            List<ImageEntry> images = html.getImages();
 
             // header tags
             int h = 0;
@@ -912,12 +912,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     !content.endsWith(" " + r)) content += " " + r;
             }
         }
-        
-        if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
+
+        // handle image source meta data
+        if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) {
             add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
             content = digestURL.toTokens(); // remove all other entry but the url tokens
         }
-        
+
         // content (must be written after special parser data, since this can influence the content)
         if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
         if (allAttr || contains(CollectionSchema.wordcount_i)) {

From eaf0e8ff2c2595a13851f3ea1d53ca1909b3412a Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Mon, 31 Aug 2015 01:58:36 +0200
Subject: [PATCH 3/3] start recording/indexing pixel size for image document as
 for linked images

---
 .../schema/CollectionConfiguration.java       | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index c9fb596e9..b9617d1f9 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -914,9 +914,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }
 
         // handle image source meta data
-        if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) {
-            add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
-            content = digestURL.toTokens(); // remove all other entry but the url tokens
+        if (document.getContentDomain() == ContentDomain.IMAGE) {
+            // add image pixel size if known
+            Iterator<ImageEntry> imgit = document.getImages().values().iterator();
+            if (imgit.hasNext()) {
+                ImageEntry img = imgit.next();
+                int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
+                if (imgpixels > 0) {
+                    if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height());
+                    if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width());
+                    if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
+                }
+            }
+
+            if (allAttr || contains(CollectionSchema.images_text_t))  {
+                add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
+                content = digestURL.toTokens(); // remove all other entries but the url tokens
+            }
         }
 
         // content (must be written after special parser data, since this can influence the content)