From 8792e6c6e97e4ca4f70beb1a6210045acd422437 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 18 Jun 2013 13:28:30 +0200 Subject: [PATCH] stub for better image indexing --- defaults/solr.collection.schema | 19 +++++++++---- .../schema/CollectionConfiguration.java | 28 ++++++++++++------- .../yacy/search/schema/CollectionSchema.java | 7 +++-- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index cb69a70ac..4c10cc5b8 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -232,17 +232,26 @@ outboundlinks_protocol_sxt ## external links, the url only without the protocol outboundlinks_urlstub_txt -## all image tags, encoded as tag inclusive alt- and title property -#images_tag_sxt +## all text/words appearing in image alt texts or the tokenized url +images_text_t ## all image links without the protocol and '://' -#images_urlstub_sxt +images_urlstub_sxt ## all image link protocols -#images_protocol_sxt +images_protocol_sxt ## all image link alt tag -#images_alt_txt +images_alt_sxt + +## size of images:height +images_height_val + +## size of images:width +images_width_val + +## size of images as number of pixels (easier for ranking than using width and height) +images_pixel_val ## number of image links with alt tag #images_withalt_i diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 5ae5d4645..ba5c10fb8 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -582,27 +582,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // images final Collection imagesc = images.values(); - final List imgtags = new ArrayList(imagesc.size()); - final List imgprots = new ArrayList(imagesc.size()); - final List imgstubs = new ArrayList(imagesc.size()); - final List imgalts 
= new ArrayList(imagesc.size()); + final ArrayList imgprots = new ArrayList(imagesc.size()); + final Integer[] imgheights = new Integer[imagesc.size()]; + final Integer[] imgwidths = new Integer[imagesc.size()]; + final Integer[] imgpixels = new Integer[imagesc.size()]; + final String[] imgstubs = new String[imagesc.size()]; + final String[] imgalts = new String[imagesc.size()]; int withalt = 0; + int i = 0; for (final ImageEntry ie: imagesc) { final MultiProtocolURI uri = ie.url(); inboundLinks.remove(uri); outboundLinks.remove(uri); - imgtags.add(ie.toString()); + imgheights[i] = ie.height(); + imgwidths[i] = ie.width(); + imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width(); String protocol = uri.getProtocol(); imgprots.add(protocol); - imgstubs.add(uri.toString().substring(protocol.length() + 3)); - imgalts.add(ie.alt()); + imgstubs[i] = uri.toString().substring(protocol.length() + 3); + imgalts[i] = ie.alt(); if (ie.alt() != null && ie.alt().length() > 0) withalt++; + i++; } - if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imgtags.size()); - if (allAttr || contains(CollectionSchema.images_tag_sxt)) add(doc, CollectionSchema.images_tag_sxt, imgtags); + if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size()); if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); - if (allAttr || contains(CollectionSchema.images_alt_txt)) add(doc, CollectionSchema.images_alt_txt, imgalts); + if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts); + if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights); + if (allAttr || 
contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths); + if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels); if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt); // style sheets diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 07ba7877e..1e49c8148 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -119,10 +119,13 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"), outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"), - images_tag_sxt(SolrType.string, true, true, true, false, true, " all image tags, encoded as tag inclusive alt- and title property"), + images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"), - images_alt_txt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"), + images_alt_sxt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"), + images_height_val(SolrType.num_integer, true, true, true, false, false, "size of images:height"), + images_width_val(SolrType.num_integer, true, true, true, false, false, "size of images:width"), + images_pixel_val(SolrType.num_integer, true, true, true, false, false, "size of images as number of pixels (easier for a search restriction than with and 
height)"), images_withalt_i(SolrType.num_integer, true, true, false, false, false, "number of image links with alt tag"), htags_i(SolrType.num_integer, true, true, false, false, false, "binary pattern for the existance of h1..h6 headlines"), canonical_s(SolrType.string, true, true, false, false, false, "url inside the canonical link element"),