From 1735dbc9d9b40dcf2a5b5a09eb020e9125a4eb11 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 12 Sep 2014 16:37:01 +0200 Subject: [PATCH] enhanced image search: bugfixes and performance enhancements --- htroot/ViewImage.java | 3 +- htroot/yacysearchitem.java | 7 +- .../net/yacy/repository/LoaderDispatcher.java | 10 +- source/net/yacy/search/query/SearchEvent.java | 93 +++++++++++++------ 4 files changed, 76 insertions(+), 37 deletions(-) diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index b7fb2baba..42b40b55b 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -106,7 +106,8 @@ public class ViewImage { if (image == null) { byte[] resourceb = null; if (url != null) try { - ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); + String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName); + ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName); resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent); } catch (final IOException e) { ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage()); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index d21e38d9c..53b9a778b 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -34,7 +34,6 @@ import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; @@ -189,7 +188,7 @@ public class yacysearchitem { String resultFileName = resultURL.getFileName(); 
prop.putHTML("content_target", target); - if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); + //if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading prop.put("content_urlhash", resulthashString); prop.put("content_ranking", Float.toString(result.score())); @@ -275,8 +274,8 @@ public class yacysearchitem { final String imageUrlstring = image.imageUrl.toNormalform(true); final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); - final String license = URLLicense.aquireLicense(image.imageUrl); - sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); + final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license + //sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); prop.putHTML("content_item_hrefCache", "/ViewImage.png?maxwidth=128&maxheight=128&quadratic=&url=" + imageUrlstring); prop.putHTML("content_item_href", imageUrlstring); prop.putHTML("content_item_target", target); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 0520972d7..b72fd8e1e 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -149,10 +149,13 @@ public final class LoaderDispatcher { } public Response load(final Request request, 
final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { - Semaphore check = this.loaderSteering.get(request.url()); + Semaphore check = this.loaderSteering.get(request.url()); if (check != null) { // a loading process may be going on for that url + //ConcurrentLog.info("LoaderDispatcher", "waiting for " + request.url().toNormalform(true)); + long t = System.currentTimeMillis(); try { check.tryAcquire(5, TimeUnit.SECONDS);} catch (final InterruptedException e) {} + ConcurrentLog.info("LoaderDispatcher", "waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true)); // now the process may have terminated and we run a normal loading // which may be successful faster because of a cache hit } @@ -163,13 +166,12 @@ public final class LoaderDispatcher { check = this.loaderSteering.remove(request.url()); if (check != null) check.release(1000); return response; - } catch (final IOException e) { + } catch (final Throwable e) { throw new IOException(e); } finally { // release the semaphore anyway check = this.loaderSteering.remove(request.url()); - if (check != null) check.release(1000); - // Very noisy: ConcurrentLog.logException(e); + if (check != null) check.release(1000); } } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index cf1df2ab9..0dd16d3d8 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -94,6 +94,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; import net.yacy.search.ranking.ReferenceOrder; +import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; import net.yacy.search.snippet.ResultEntry; import net.yacy.search.snippet.TextSnippet; @@ -185,8 +186,11 @@ public final class SearchEvent { public final 
AtomicInteger remote_solr_peerCount;// the number of peers which contributed to the remote search result public int getResultCount() { - return this.local_rwi_available.get() + this.remote_rwi_available.get() + - this.remote_solr_available.get() + this.local_solr_stored.get(); + return Math.max( + this.local_rwi_available.get() + this.remote_rwi_available.get() + + this.remote_solr_available.get() + this.local_solr_stored.get(), + imageViewed.size() + sizeSpare() + ); } protected SearchEvent( @@ -1474,56 +1478,89 @@ public final class SearchEvent { return null; } + private int imagePageCounter = 0; private LinkedHashMap imageViewed = new LinkedHashMap(); - private LinkedHashMap imageSpare = new LinkedHashMap(); + private LinkedHashMap imageSpareGood = new LinkedHashMap(); + private LinkedHashMap imageSpareBad = new LinkedHashMap(); private ImageResult nthImage(int item) { Object o = SetTools.nth(this.imageViewed.values(), item); if (o == null) return null; return (ImageResult) o; } + private boolean hasSpare() { + return imageSpareGood.size() > 0 || imageSpareBad.size() > 0; + } + private boolean containsSpare(String id) { + return imageSpareGood.containsKey(id) || imageSpareBad.containsKey(id); + } + private int sizeSpare() { + return imageSpareGood.size() + imageSpareBad.size(); + } private ImageResult nextSpare() { - Map.Entry next = imageSpare.entrySet().iterator().next(); - imageViewed.put(next.getKey(), next.getValue()); - imageSpare.remove(next.getKey()); - return next.getValue(); + if (imageSpareGood.size() > 0) { + Map.Entry next = imageSpareGood.entrySet().iterator().next(); + imageViewed.put(next.getKey(), next.getValue()); + imageSpareGood.remove(next.getKey()); + return next.getValue(); + } + if (imageSpareBad.size() > 0) { + Map.Entry next = imageSpareBad.entrySet().iterator().next(); + imageViewed.put(next.getKey(), next.getValue()); + imageSpareBad.remove(next.getKey()); + return next.getValue(); + } + return null; } public ImageResult 
oneImageResult(final int item, final long timeout) throws MalformedURLException { if (item < imageViewed.size()) return nthImage(item); - if (imageSpare.size() > 0) return nextSpare(); - ResultEntry ms = oneResult(item, timeout); + if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare + ResultEntry ms = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare // check if the match was made in the url or in the image links - if (ms == null) throw new MalformedURLException("no image url found"); + if (ms == null) { + if (hasSpare()) return nextSpare(); + throw new MalformedURLException("no image url found"); + } // try to get more SolrDocument doc = ms.getNode(); // there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents. String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName()); - if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) { + boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... 
I know this is a bad hack, but many results come from wikipedia and we must handle that + if (!fakeImageHost && (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE)) { String id = ASCII.String(ms.hash()); - if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); + if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); } else { - Collection alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); - Collection img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); - Collection prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()); - if (img != null && img.size() > 0) { + Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); + Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); + if (imgO != null && imgO.size() > 0 && imgO instanceof List) { + List alt = altO == null ? new ArrayList(imgO.size()) : (List) altO; + List img = (List) imgO; + List prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size()); + Collection heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); + Collection widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()); + List height = heightO instanceof List ? (List) heightO : new ArrayList(0); /* no height values available (null) or not a List: fall back to an empty list instead of dereferencing null */ + List width = widthO instanceof List ? (List) widthO : new ArrayList(0); /* same null-safe fallback for the width values */ for (int c = 0; c < img.size(); c++) { - String image_urlstub = (String) SetTools.nth(img, c); - String image_alt = alt != null && alt.size() > c ? 
(String) SetTools.nth(alt, c) : ""; - if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) { - try { - DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub); - Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c); - Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c); - String id = ASCII.String(imageUrl.hash()); - if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0)); - } catch (MalformedURLException e) { - continue; + String image_urlstub = (String) img.get(c); + String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : ""; + boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)); + try { + DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub); + Integer h = height.size() > c ? (Integer) height.get(c) : null; /* the height list may be shorter than img; null marks an unknown height */ + Integer w = width.size() > c ? (Integer) width.get(c) : null; /* the width list may be shorter than img; null marks an unknown width */ + boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16; + String id = ASCII.String(imageUrl.hash()); + if (!imageViewed.containsKey(id) && !containsSpare(id)) { + ImageResult imageResult = new ImageResult(ms.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0); + if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult); } + } catch (MalformedURLException e) { + continue; } } } } - if (imageSpare.size() > 0) return nextSpare(); + if (hasSpare()) return nextSpare(); throw new MalformedURLException("no image url found"); }