stub for better image indexing

pull/1/head
orbiter 12 years ago
parent 97f2ac9091
commit 8792e6c6e9

@ -232,17 +232,26 @@ outboundlinks_protocol_sxt
## external links, the url only without the protocol
outboundlinks_urlstub_txt
## all image tags, encoded as <img> tags including their alt and title properties
#images_tag_sxt
## all text/words appearing in image alt texts or the tokenized url
images_text_t
## all image links without the protocol and '://'
#images_urlstub_sxt
images_urlstub_sxt
## all image link protocols
#images_protocol_sxt
images_protocol_sxt
## all image link alt tag
#images_alt_txt
images_alt_sxt
## size of images:height
images_height_val
## size of images:width
images_width_val
## size of images as number of pixels (easier for ranking than using width and height)
images_pixel_val
## number of image links with alt tag
#images_withalt_i

@ -582,27 +582,35 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// images
final Collection<ImageEntry> imagesc = images.values();
final List<String> imgtags = new ArrayList<String>(imagesc.size());
final List<String> imgprots = new ArrayList<String>(imagesc.size());
final List<String> imgstubs = new ArrayList<String>(imagesc.size());
final List<String> imgalts = new ArrayList<String>(imagesc.size());
final ArrayList<String> imgprots = new ArrayList<String>(imagesc.size());
final Integer[] imgheights = new Integer[imagesc.size()];
final Integer[] imgwidths = new Integer[imagesc.size()];
final Integer[] imgpixels = new Integer[imagesc.size()];
final String[] imgstubs = new String[imagesc.size()];
final String[] imgalts = new String[imagesc.size()];
int withalt = 0;
int i = 0;
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri);
outboundLinks.remove(uri);
imgtags.add(ie.toString());
imgheights[i] = ie.height();
imgwidths[i] = ie.width();
imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width();
String protocol = uri.getProtocol();
imgprots.add(protocol);
imgstubs.add(uri.toString().substring(protocol.length() + 3));
imgalts.add(ie.alt());
imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt();
if (ie.alt() != null && ie.alt().length() > 0) withalt++;
i++;
}
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imgtags.size());
if (allAttr || contains(CollectionSchema.images_tag_sxt)) add(doc, CollectionSchema.images_tag_sxt, imgtags);
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imagesc.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
if (allAttr || contains(CollectionSchema.images_alt_txt)) add(doc, CollectionSchema.images_alt_txt, imgalts);
if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts);
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights);
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
// style sheets

@ -119,10 +119,13 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"),
images_tag_sxt(SolrType.string, true, true, true, false, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"),
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"),
images_alt_sxt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"),
images_height_val(SolrType.num_integer, true, true, true, false, false, "size of images:height"),
images_width_val(SolrType.num_integer, true, true, true, false, false, "size of images:width"),
images_pixel_val(SolrType.num_integer, true, true, true, false, false, "size of images as number of pixels (easier for a search restriction than with and height)"),
images_withalt_i(SolrType.num_integer, true, true, false, false, false, "number of image links with alt tag"),
htags_i(SolrType.num_integer, true, true, false, false, false, "binary pattern for the existance of h1..h6 headlines"),
canonical_s(SolrType.string, true, true, false, false, false, "url inside the canonical link element"),

Loading…
Cancel
Save