From 370ba9da71cb2d43efb00e65d92fe28312ae9b69 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 24 May 2015 21:48:58 +0200 Subject: [PATCH] On imageSearch prefer mime to sort out non-image documents Generalize the hack to prevent urls with just an img extension from being returned, improving http://mantis.tokeek.de/view.php?id=528 --- source/net/yacy/search/query/SearchEvent.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 0232c252a..2d426eaed 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1580,8 +1580,13 @@ public final class SearchEvent { SolrDocument doc = ms.getNode(); // there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents. String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName()); - boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that - if (!fakeImageHost && (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE)) { + + // boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... 
I know this is a bad hack, but many results come from wikipedia and we must handle that + // generalize above hack (regarding url with file extension but beeing a html (with html mime) + char docType = Response.docType(mime); // first look at mime (as some html pages have img extension (like wikipedia) + if (docType == Response.DT_UNKNOWN) docType = Response.docType(ms.url()); // try extension if mime wasn't successful + + if (docType == Response.DT_IMAGE) { String id = ASCII.String(ms.hash()); if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); } else {