Merge branch 'master' of ssh://git@github.com/yacy/yacy_search_server.git

10 years ago · e0dda0c01c
parent a4509ea2ca eaf0e8ff2c
commit e0dda0c01c
3 changed files with 38 additions and 15 deletions
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -1178,6 +1178,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0;
    }

+    /**
+     * @deprecated decides by file extension only; use a mime type aware method instead (e.g. Document.getContentDomain() == ContentDomain.IMAGE)
+     */
+    @Deprecated
    public static final boolean isImage(final String extension) {
        return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
    }
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@ -1598,7 +1598,7 @@ public final class SearchEvent {
            Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
            Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
            if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
-                List<Object> alt = altO == null ? new ArrayList<Object>(imgO.size()) : (List<Object>) altO;
+                List<Object> alt = altO == null ? null : (List<Object>) altO;
                List<Object> img = (List<Object>) imgO;
                List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
                Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
@ -1608,17 +1608,21 @@ public final class SearchEvent {
                for (int c = 0; c < img.size(); c++) {
                    String image_urlstub =  (String) img.get(c);
                    if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic
-                    String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
-                    boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
                    try {
+                        int h = height == null ? 0 : (Integer) height.get(c);
+                        int w = width == null ? 0 : (Integer) width.get(c);
+
+                        // check that the size is suitable for display (parser may init unknown dimensions with -1)
+                        if (h > 0 && h <= 16) continue; // too small for display
+                        if (w > 0 && w <= 16) continue; // too small for display
+                        
                        DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
-                        Integer h = height == null ? null : (Integer) height.get(c);
-                        Integer w = width == null ? null : (Integer) width.get(c);
-                        boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
                        String id = ASCII.String(imageUrl.hash());
                        if (!imageViewed.containsKey(id) && !containsSpare(id)) {
-                            ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
-                            if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
+                            String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : "";
+                            ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0);
+                            boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
+                            if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
                        }
                    } catch (MalformedURLException e) {
                        continue;
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Pattern;

 import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.AnchorURL;
@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        LinkedHashMap<DigestURL,String> outboundLinks = document.outboundLinks();

        Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
-        List<ImageEntry> images = new ArrayList<ImageEntry>();
        int c = 0;
        final Object parser = document.getParserObject();
        boolean containsCanonical = false;
        DigestURL canonical = null;
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;
-            images = html.getImages();
+            List<ImageEntry> images = html.getImages();

            // header tags
            int h = 0;
@ -912,12 +912,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                    !content.endsWith(" " + r)) content += " " + r;
            }
        }
-        
-        if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
-            add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
-            content = digestURL.toTokens(); // remove all other entry but the url tokens
+
+        // handle image source meta data
+        if (document.getContentDomain() == ContentDomain.IMAGE) {
+            // add image pixel size if known
+            Iterator<ImageEntry> imgit = document.getImages().values().iterator();
+            if (imgit.hasNext()) {
+                ImageEntry img = imgit.next();
+                int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
+                if (imgpixels > 0) {
+                    if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height());
+                    if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width());
+                    if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
+                }
+            }
+
+            if (allAttr || contains(CollectionSchema.images_text_t))  {
+                add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
+                content = digestURL.toTokens(); // remove all other entry but the url tokens
+            }
        }
-        
+
        // content (must be written after special parser data, since this can influence the content)
        if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
        if (allAttr || contains(CollectionSchema.wordcount_i)) {