From 1735dbc9d9b40dcf2a5b5a09eb020e9125a4eb11 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Fri, 12 Sep 2014 16:37:01 +0200
Subject: [PATCH] enhanced image search: bugfixes and performance enhancements

---
 htroot/ViewImage.java                         |  3 +-
 htroot/yacysearchitem.java                    |  7 +-
 .../net/yacy/repository/LoaderDispatcher.java | 10 +-
 source/net/yacy/search/query/SearchEvent.java | 93 +++++++++++++------
 4 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index b7fb2baba..42b40b55b 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -106,7 +106,8 @@ public class ViewImage {
         if (image == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
+                String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
+                ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
                 resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent);
             } catch (final IOException e) {
                 ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index d21e38d9c..53b9a778b 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -34,7 +34,6 @@ import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.feed.RSSMessage;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
-import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.RequestHeader.FileType;
@@ -189,7 +188,7 @@ public class yacysearchitem {
 
             String resultFileName = resultURL.getFileName();
             prop.putHTML("content_target", target);
-            if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
+            //if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
             prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
             prop.put("content_urlhash", resulthashString);
             prop.put("content_ranking", Float.toString(result.score()));
@@ -275,8 +274,8 @@ public class yacysearchitem {
                 final String imageUrlstring = image.imageUrl.toNormalform(true);
                 final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
 
-                final String license = URLLicense.aquireLicense(image.imageUrl);
-                sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
+                final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license
+                //sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
                 prop.putHTML("content_item_hrefCache", "/ViewImage.png?maxwidth=128&maxheight=128&quadratic=&url=" + imageUrlstring);
                 prop.putHTML("content_item_href", imageUrlstring);
                 prop.putHTML("content_item_target", target);
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 0520972d7..b72fd8e1e 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -149,10 +149,13 @@ public final class LoaderDispatcher {
     }
 
     public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
-        Semaphore check = this.loaderSteering.get(request.url());
+        Semaphore check = this.loaderSteering.get(request.url());        
         if (check != null) {
             // a loading process may be going on for that url
+            //ConcurrentLog.info("LoaderDispatcher", "waiting for " + request.url().toNormalform(true));
+            long t = System.currentTimeMillis();
             try { check.tryAcquire(5, TimeUnit.SECONDS);} catch (final InterruptedException e) {}
+            ConcurrentLog.info("LoaderDispatcher", "waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true));
             // now the process may have terminated and we run a normal loading
             // which may be successful faster because of a cache hit
         }
@@ -163,13 +166,12 @@ public final class LoaderDispatcher {
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
-        } catch (final IOException e) {
+        } catch (final Throwable e) {
             throw new IOException(e);
         } finally {
             // release the semaphore anyway
             check = this.loaderSteering.remove(request.url());
-            if (check != null) check.release(1000);
-            // Very noisy: ConcurrentLog.logException(e);            
+            if (check != null) check.release(1000);          
         }
     }
 
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index cf1df2ab9..0dd16d3d8 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -94,6 +94,7 @@ import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
 import net.yacy.search.ranking.ReferenceOrder;
+import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.search.snippet.ResultEntry;
 import net.yacy.search.snippet.TextSnippet;
@@ -185,8 +186,11 @@ public final class SearchEvent {
     public final AtomicInteger remote_solr_peerCount;// the number of peers which contributed to the remote search result
     
     public int getResultCount() {
-        return this.local_rwi_available.get() + this.remote_rwi_available.get() +
-               this.remote_solr_available.get() + this.local_solr_stored.get();
+        return Math.max(
+                this.local_rwi_available.get() + this.remote_rwi_available.get() +
+                this.remote_solr_available.get() + this.local_solr_stored.get(),
+                imageViewed.size() + sizeSpare()
+               );
     }
     
     protected SearchEvent(
@@ -1474,56 +1478,89 @@ public final class SearchEvent {
         return null;
     }
 
+    private int imagePageCounter = 0;
     private LinkedHashMap<String, ImageResult> imageViewed = new LinkedHashMap<String, ImageResult>();
-    private LinkedHashMap<String, ImageResult> imageSpare = new LinkedHashMap<String, ImageResult>();
+    private LinkedHashMap<String, ImageResult> imageSpareGood = new LinkedHashMap<String, ImageResult>();
+    private LinkedHashMap<String, ImageResult> imageSpareBad = new LinkedHashMap<String, ImageResult>();
     private ImageResult nthImage(int item) {
         Object o = SetTools.nth(this.imageViewed.values(), item);
         if (o == null) return null;
         return (ImageResult) o;
     }
+    private boolean hasSpare() {
+        return imageSpareGood.size() > 0 || imageSpareBad.size() > 0;
+    }
+    private boolean containsSpare(String id) {
+        return imageSpareGood.containsKey(id) || imageSpareBad.containsKey(id);
+    }
+    private int sizeSpare() {
+        return imageSpareGood.size() + imageSpareBad.size();
+    }
     private ImageResult nextSpare() {
-        Map.Entry<String, ImageResult> next = imageSpare.entrySet().iterator().next();
-        imageViewed.put(next.getKey(), next.getValue());
-        imageSpare.remove(next.getKey());
-        return next.getValue();
+        if (imageSpareGood.size() > 0) {
+            Map.Entry<String, ImageResult> next = imageSpareGood.entrySet().iterator().next();
+            imageViewed.put(next.getKey(), next.getValue());
+            imageSpareGood.remove(next.getKey());
+            return next.getValue();
+        }
+        if (imageSpareBad.size() > 0) {
+            Map.Entry<String, ImageResult> next = imageSpareBad.entrySet().iterator().next();
+            imageViewed.put(next.getKey(), next.getValue());
+            imageSpareBad.remove(next.getKey());
+            return next.getValue();
+        }
+        return null;
     }
     
     public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
         if (item < imageViewed.size()) return nthImage(item);
-        if (imageSpare.size() > 0) return nextSpare();
-        ResultEntry ms = oneResult(item, timeout);
+        if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare
+        ResultEntry ms = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare
         // check if the match was made in the url or in the image links
-        if (ms == null) throw new MalformedURLException("no image url found");
+        if (ms == null) {
+            if (hasSpare()) return nextSpare();
+            throw new MalformedURLException("no image url found");
+        }
         // try to get more
         SolrDocument doc = ms.getNode();
         // there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
         String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
-        if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) {
+        boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that
+        if (!fakeImageHost && (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE)) {
             String id = ASCII.String(ms.hash());
-            if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
+            if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
         } else {
-            Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
-            Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
-            Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
-            if (img != null && img.size() > 0) {
+            Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
+            Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
+            if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
+                List<Object> alt = altO == null ? new ArrayList<Object>(imgO.size()) : (List<Object>) altO;
+                List<Object> img = (List<Object>) imgO;
+                List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
+                Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
+                Collection<Object> widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName());
+                List<Object> height = heightO == null ? new ArrayList<Object>(heightO.size()) : (List<Object>) heightO;
+                List<Object> width = widthO == null ? new ArrayList<Object>(widthO.size()) : (List<Object>) widthO;
                 for (int c = 0; c < img.size(); c++) {
-                    String image_urlstub =  (String) SetTools.nth(img, c);
-                    String image_alt = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
-                    if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) {
-                        try {
-                            DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub);
-                            Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
-                            Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
-                            String id = ASCII.String(imageUrl.hash());
-                            if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
-                        } catch (MalformedURLException e) {
-                            continue;
+                    String image_urlstub =  (String) img.get(c);
+                    String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
+                    boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
+                    try {
+                        DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
+                        Integer h = (Integer) height.get(c);
+                        Integer w = (Integer) width.get(c);
+                        boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
+                        String id = ASCII.String(imageUrl.hash());
+                        if (!imageViewed.containsKey(id) && !containsSpare(id)) {
+                            ImageResult imageResult = new ImageResult(ms.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
+                            if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
                         }
+                    } catch (MalformedURLException e) {
+                        continue;
                     }
                 }
             }
         }
-        if (imageSpare.size() > 0) return nextSpare();
+        if (hasSpare()) return nextSpare();
         throw new MalformedURLException("no image url found");
     }