From 19f1308bf09172d2be66c58289d52ba2b2c0cf9d Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 30 Aug 2015 02:19:52 +0200 Subject: [PATCH 1/3] enforce th result images limit to > 16x16px for linked images http://mantis.tokeek.de/view.php?id=594 --- source/net/yacy/search/query/SearchEvent.java | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index cdcec498d..49ee55380 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1598,7 +1598,7 @@ public final class SearchEvent { Collection altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); Collection imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); if (imgO != null && imgO.size() > 0 && imgO instanceof List) { - List alt = altO == null ? new ArrayList(imgO.size()) : (List) altO; + List alt = altO == null ? null : (List) altO; List img = (List) imgO; List prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size()); Collection heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()); @@ -1608,17 +1608,21 @@ public final class SearchEvent { for (int c = 0; c < img.size(); c++) { String image_urlstub = (String) img.get(c); if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic - String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : ""; - boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)); try { + int h = height == null ? 0 : (Integer) height.get(c); + int w = width == null ? 
0 : (Integer) width.get(c); + + // check size good for display (parser may init unknown dimension with -1) + if (h > 0 && h <= 16) continue; // too small for display + if (w > 0 && w <= 16) continue; // too small for display + DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub); - Integer h = height == null ? null : (Integer) height.get(c); - Integer w = width == null ? null : (Integer) width.get(c); - boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16; String id = ASCII.String(imageUrl.hash()); if (!imageViewed.containsKey(id) && !containsSpare(id)) { - ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0); - if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult); + String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : ""; + ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0); + boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)); + if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult); } } catch (MalformedURLException e) { continue; From c33229fc0c2b2dc5325137171a10c9521d5ac944 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 30 Aug 2015 23:02:19 +0200 Subject: [PATCH 2/3] check mime prior to ext for metadata modification for images --- .../net/yacy/cora/document/id/MultiProtocolURL.java | 4 ++++ .../yacy/search/schema/CollectionConfiguration.java | 11 ++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 6df62a416..7cfc1543f 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -1178,6 +1178,10 @@ public class 
MultiProtocolURL implements Serializable, Comparable 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0; } + /** + * @deprecated use a mimetype-aware method (e.g. Document.getContentDomain() == ContentDomain.IMAGE) + */ + @Deprecated public static final boolean isImage(final String extension) { return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE; } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e3f46b4c0..c9fb596e9 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; @@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri LinkedHashMap outboundLinks = document.outboundLinks(); Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); - List images = new ArrayList(); int c = 0; final Object parser = document.getParserObject(); boolean containsCanonical = false; DigestURL canonical = null; if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; - images = html.getImages(); + List images = html.getImages(); // header tags int h = 0; @@ -912,12 +912,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri !content.endsWith(" " + r)) content += " " + r; } } - - if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) { + + // 
handle image source meta data + if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) { add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser content = digestURL.toTokens(); // remove all other entry but the url tokens } - + // content (must be written after special parser data, since this can influence the content) if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content); if (allAttr || contains(CollectionSchema.wordcount_i)) { From eaf0e8ff2c2595a13851f3ea1d53ca1909b3412a Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 31 Aug 2015 01:58:36 +0200 Subject: [PATCH 3/3] start recording/indexing pixel size for image document as for linked images --- .../schema/CollectionConfiguration.java | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index c9fb596e9..b9617d1f9 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -914,9 +914,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // handle image source meta data - if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) { - add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser - content = digestURL.toTokens(); // remove all other entry but the url tokens + if (document.getContentDomain() == ContentDomain.IMAGE) { + // add image pixel size if known + Iterator imgit = document.getImages().values().iterator(); + if (imgit.hasNext()) { + ImageEntry img = imgit.next(); + int imgpixels = (img.height() < 0 || img.width() < 0) ? 
-1 : img.height() * img.width(); + if (imgpixels > 0) { + if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height()); + if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width()); + if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels); + } + } + + if (allAttr || contains(CollectionSchema.images_text_t)) { + add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser + content = digestURL.toTokens(); // remove all other entry but the url tokens + } } // content (must be written after special parser data, since this can influence the content)