From 3cc5619d93394c85484e3f6d43a526f14c9aca0e Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 2 Feb 2016 09:57:54 +0100 Subject: [PATCH] Improved HTML icons indexing and rendering in search results. See http://mantis.tokeek.de/view.php?id=629 --- defaults/solr.collection.schema | 12 ++ htroot/api/getpageinfo.java | 17 +- htroot/api/getpageinfo_p.java | 17 +- htroot/yacysearchitem.java | 73 ++++++- source/net/yacy/document/Document.java | 45 ++-- .../document/parser/html/ContentScraper.java | 109 ++++++++-- .../yacy/document/parser/html/IconEntry.java | 198 ++++++++++++++++++ .../parser/html/IconLinkRelations.java | 92 ++++++++ .../net/yacy/document/parser/htmlParser.java | 2 +- .../kelondro/data/meta/URIMetadataNode.java | 84 +++++++- .../schema/CollectionConfiguration.java | 182 +++++++++++----- .../yacy/search/schema/CollectionSchema.java | 6 + .../parser/html/ContentScraperTest.java | 125 +++++++++++ .../document/parser/html/IconEntryTest.java | 192 +++++++++++++++++ .../data/meta/URIMetadataNodeTest.java | 157 ++++++++++++++ test/java/yacysearchitemTest.java | 187 +++++++++++++++++ 16 files changed, 1398 insertions(+), 100 deletions(-) create mode 100644 source/net/yacy/document/parser/html/IconEntry.java create mode 100644 source/net/yacy/document/parser/html/IconLinkRelations.java create mode 100644 test/java/net/yacy/document/parser/html/ContentScraperTest.java create mode 100644 test/java/net/yacy/document/parser/html/IconEntryTest.java create mode 100644 test/java/net/yacy/kelondro/data/meta/URIMetadataNodeTest.java create mode 100644 test/java/yacysearchitemTest.java diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 71ab7c6cb..48fff9136 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -278,6 +278,18 @@ outboundlinks_urlstub_sxt ## external links, the visible anchor text outboundlinks_anchortext_txt +## all icon links without the protocol and '://' +icons_urlstub_sxt + +## all icon links 
protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored +icons_protocol_sxt + +## all icon links relationships space separated (e.g. 'icon apple-touch-icon') +icons_rel_sxt + +## all icon sizes space separated (e.g. '16x16 32x32') +icons_sizes_sxt + ## all text/words appearing in image alt texts or the tokenized url images_text_t diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index 4d999ec2a..ca926022e 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -35,6 +35,11 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -47,11 +52,6 @@ import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - public class getpageinfo { @@ -110,8 +110,13 @@ public class getpageinfo { // put the document title prop.putXML("title", removelinebreaks(scraper.dc_title())); + DigestURL favicon = null; + if (scraper.getIcons() != null && !scraper.getIcons().isEmpty()) { + favicon = scraper.getIcons().keySet().iterator().next(); + } + // put the favicon that belongs to the document - prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); + prop.put("favicon", (favicon == null) ? 
"" : favicon.toString()); // put keywords final Set list = scraper.dc_subject(); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index f5269ad32..15bf2ae87 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -35,6 +35,11 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -47,11 +52,6 @@ import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - public class getpageinfo_p { @@ -109,9 +109,14 @@ public class getpageinfo_p { if (scraper != null) { // put the document title prop.putXML("title", scraper.dc_title()); + + DigestURL favicon = null; + if (scraper.getIcons() != null && !scraper.getIcons().isEmpty()) { + favicon = scraper.getIcons().keySet().iterator().next(); + } // put the favicon that belongs to the document - prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); + prop.put("favicon", (favicon == null) ? 
"" : favicon.toString()); // put keywords final Set list = scraper.dc_subject(); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 7d1d1295a..277f68852 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -24,6 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.awt.Dimension; import java.io.File; import java.net.MalformedURLException; import java.util.Collection; @@ -48,6 +49,7 @@ import net.yacy.crawler.data.Transactions; import net.yacy.crawler.data.Transactions.State; import net.yacy.crawler.retrieval.Response; import net.yacy.data.URLLicense; +import net.yacy.document.parser.html.IconEntry; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.util.Formatter; import net.yacy.peers.NewsPool; @@ -127,14 +129,6 @@ public class yacysearchitem { final DigestURL resultURL = result.url(); final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); - final int port = resultURL.getPort(); - DigestURL faviconURL = null; - if ((fileType == FileType.HTML || fileType == FileType.JSON) && !sb.isIntranetMode()) try { - faviconURL = new DigestURL(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? 
(":" + port) : "") + "/favicon.ico"); - } catch (final MalformedURLException e1) { - ConcurrentLog.logException(e1); - faviconURL = null; - } final String resource = theSearch.query.domType.toString(); final String origQ = theSearch.query.getQueryGoal().getQueryString(true); prop.put("content", 1); // switch on specific content @@ -194,6 +188,7 @@ public class yacysearchitem { boolean isAtomFeed = header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("atom"); String resultFileName = resultURL.getFileName(); prop.putHTML("content_target", target); + DigestURL faviconURL = getFaviconURL(sb.isIntranetMode(), fileType, result, new Dimension(16, 16)); prop.putHTML("content_faviconUrl", processFaviconURL(authenticated, faviconURL)); prop.put("content_urlhash", urlhash); prop.put("content_ranking", Float.toString(result.score())); @@ -342,6 +337,68 @@ public class yacysearchitem { return prop; } + /** + * Tries to retrieve favicon url from solr result document, or generates + * default favicon URL (i.e. "http://host/favicon.ico") from resultURL and + * port. + * + * @param isIntranetMode + * when true returns null + * @param fileType + * file type result as specified in request header + * @param result + * solr document result. Must not be null. + * @param preferredSize preferred icon size. If no one matches, most close icon is returned. + * @return favicon URL or null when even default favicon URL can not be generated + * @throws NullPointerException when one requested parameter is null + */ + protected static DigestURL getFaviconURL(final boolean isIntranetMode, final RequestHeader.FileType fileType, + final URIMetadataNode result, Dimension preferredSize) { + DigestURL faviconURL = null; + if ((fileType == FileType.HTML || fileType == FileType.JSON) && !isIntranetMode) { + try { + String defaultFaviconURL = result.url().getProtocol() + "://" + result.url().getHost() + + ((result.url().getPort() != -1) ? 
(":" + result.url().getPort()) : "") + "/favicon.ico"; + IconEntry faviconEntry = null; + + /* We look preferably for a standard icon with preferred size, but accept as a fallback other icons below 128x128 or with no known size*/ + boolean foundStandard = false; + double closestDistance = Double.MAX_VALUE; + for(IconEntry icon : result.getIcons()) { + boolean isStandard = icon.isStandardIcon(); + double distance = IconEntry.getDistance(icon.getClosestSize(preferredSize), preferredSize); + boolean match = false; + if(foundStandard) { + /* Already found a standard icon : now must find a standard icon with closer size */ + match = isStandard && distance < closestDistance; + } else { + /* No standard icon yet found : prefer a standard icon, or check size */ + match = isStandard || distance < closestDistance; + } + if(match) { + faviconEntry = icon; + closestDistance = distance; + foundStandard = isStandard; + if(isStandard && distance == 0.0) { + break; + } + } + } + + if (faviconEntry == null) { + faviconURL = new DigestURL(defaultFaviconURL); + } else { + faviconURL = faviconEntry.getUrl(); + } + + } catch (final MalformedURLException e1) { + ConcurrentLog.logException(e1); + faviconURL = null; + } + } + return faviconURL; + } + /** * @param authenticated * true when current user is authenticated diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index a594878e2..128008611 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -32,7 +32,6 @@ import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.MalformedURLException; -import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -62,6 +61,7 @@ import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import 
net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.IconEntry; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.util.FileUtils; @@ -87,7 +87,8 @@ public class Document { private LinkedHashMap audiolinks, videolinks, applinks, hyperlinks; // TODO: check if redundant value (set to key.getNameProperty()) is needed private LinkedHashMap inboundlinks, outboundlinks; private Set emaillinks; // mailto: links - private MultiProtocolURL favicon; + /** links to icons that belongs to the document (mapped by absolute URL) */ + private Map icons; private boolean resorted; private final Set languages; private boolean indexingDenied; @@ -139,6 +140,7 @@ public class Document { this.videolinks = null; this.applinks = null; this.emaillinks = null; + this.icons = new HashMap<>(); this.resorted = false; this.inboundlinks = null; this.outboundlinks = null; @@ -576,6 +578,7 @@ dc_rights // that can be calculated from given hyperlinks and imagelinks this.hyperlinks.putAll(allReflinks(this.images.values())); + this.hyperlinks.putAll(allReflinks(this.icons.keySet())); this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet())); this.hyperlinks.putAll(allReflinks(this.videolinks.keySet())); this.hyperlinks.putAll(allReflinks(this.applinks.keySet())); @@ -658,6 +661,8 @@ dc_rights url = new AnchorURL((String) o); else if (o instanceof ImageEntry) url = new AnchorURL(((ImageEntry) o).url()); + else if (o instanceof IconEntry) + url = new AnchorURL(((IconEntry) o).getUrl()); else { assert false; continue loop; @@ -718,20 +723,26 @@ dc_rights this.images.putAll(doc.getImages()); } } - + /** - * @return the {@link URL} to the favicon that belongs to the document + * @return links to icons that belongs to the document (mapped by absolute URL) */ - public MultiProtocolURL getFavicon() { - return this.favicon; - } - + public Map getIcons() { + return icons; + } + /** - * @param faviconURL the {@link URL} to the favicon that belongs to the 
document + * Set links to icons that belongs to the document (mapped by absolute URL) + * @param icons */ - public void setFavicon(final MultiProtocolURL faviconURL) { - this.favicon = faviconURL; - } + public void setIcons(Map icons) { + /* Better to ensure now icons property will not be null */ + if(icons != null) { + this.icons = icons; + } else { + this.icons = new HashMap<>(); + } + } public int inboundLinkNofollowCount() { if (this.inboundlinks == null) resortLinks(); @@ -836,9 +847,13 @@ dc_rights } /** - * merge documents: a helper method for all parsers that return multiple documents - * @param docs - * @return + * merge documents: a helper method for all parsers that return multiple documents. + * Note : when docs contains more than one item, eventual icons in each docs are not merged in result doc, + * as their scope is limited to only one document. + * @param location url of merged document + * @param globalMime Mime type of merged document + * @param docs documents to merge + * @return document resulting of merge, or original document when docs contains only one item. 
*/ public static Document mergeDocuments(final DigestURL location, final String globalMime, final Document[] docs) { if (docs == null || docs.length == 0) return null; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 4daa9e8ce..09d0bcbd1 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -20,6 +20,7 @@ package net.yacy.document.parser.html; +import java.awt.Dimension; import java.io.ByteArrayInputStream; import java.io.CharArrayReader; import java.io.File; @@ -31,12 +32,16 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Set; @@ -52,6 +57,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.storage.SizeLimitedMap; import net.yacy.cora.storage.SizeLimitedSet; +import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; @@ -66,7 +72,7 @@ import net.yacy.kelondro.util.ISO639; public class ContentScraper extends AbstractScraper implements Scraper { private final static int MAX_TAGSIZE = 1024 * 1024; - public static final int MAX_DOCSIZE = 40 * 1024 * 1024; + public static final int MAX_DOCSIZE = 40 * 1024 * 1024; private final char degree = '\u00B0'; private final char[] minuteCharsHTML = "'".toCharArray(); @@ -194,10 +200,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { 
private int breadcrumbs; - /** - * {@link MultiProtocolURL} to the favicon that belongs to the document - */ - private MultiProtocolURL favicon; + /** links to icons that belongs to the document (mapped by absolute URL)*/ + private final Map icons; /** * The document root {@link MultiProtocolURL} @@ -230,6 +234,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.css = new SizeLimitedMap(maxLinks); this.anchors = new ArrayList(); this.images = new ArrayList(); + this.icons = new HashMap<>(); this.embeds = new SizeLimitedMap(maxLinks); this.frames = new SizeLimitedSet(maxLinks); this.iframes = new SizeLimitedSet(maxLinks); @@ -405,6 +410,69 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } } + + /** + * Parses sizes icon link attribute. (see + * http://www.w3.org/TR/html5/links.html#attr-link-sizes) Eventual + * duplicates are removed. + * + * @param sizesAttr + * sizes attribute string, may be null + * @return a set of sizes eventually empty. + */ + public static Set parseSizes(String sizesAttr) { + Set sizes = new HashSet(); + Set tokens = parseSpaceSeparatedTokens(sizesAttr); + for (String token : tokens) { + /* + * "any" keyword may be present, but doesn't have to produce a + * dimension result + */ + if (token != null) { + Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token); + if (matcher.matches()) { + /* With given pattern no NumberFormatException can occur */ + sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2)))); + } + } + } + return sizes; + } + + /** + * Parses a space separated tokens attribute value (see + * http://www.w3.org/TR/html5/infrastructure.html#space-separated-tokens). + * Eventual duplicates are removed. 
+ * + * @param attr + * attribute string, may be null + * @return a set of tokens eventually empty + */ + public static Set parseSpaceSeparatedTokens(String attr) { + Set tokens = new HashSet<>(); + /* Check attr string is not empty to avoid adding a single empty string + * in result */ + if (attr != null && !attr.trim().isEmpty()) { + String[] items = attr.trim().split(CommonPattern.SPACES.pattern()); + Collections.addAll(tokens, items); + } + return tokens; + } + + /** + * Retain only icon relations (standard and non standard) from tokens . + * @param relTokens relationship tokens (parsed from a rel attribute) + * @return a Set of icon relations, eventually empty + */ + public Set retainIconRelations(Collection relTokens) { + HashSet iconRels = new HashSet<>(); + for(String token : relTokens) { + if(IconLinkRelations.isIconRel(token)) { + iconRels.add(token.toLowerCase(Locale.ENGLISH)); + } + } + return iconRels; + } @Override public void scrapeTag0(Tag tag) { @@ -473,14 +541,28 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (newLink != null) { tag.opts.put("href", newLink.toNormalform(true)); String rel = tag.opts.getProperty("rel", EMPTY_STRING); + /* Rel attribute is supposed to be a set of space-separated tokens */ + Set relTokens = parseSpaceSeparatedTokens(rel); + final String linktitle = tag.opts.getProperty("title", EMPTY_STRING); final String type = tag.opts.getProperty("type", EMPTY_STRING); final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING); - if (rel.equalsIgnoreCase("shortcut icon") || rel.equalsIgnoreCase("icon")) { // html5 -> rel="icon") - final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); - this.images.add(ie); - this.favicon = newLink; + Set iconRels = retainIconRelations(relTokens); + /* Distinguish icons from images. 
It will enable for example to later search only images and no icons */ + if (!iconRels.isEmpty()) { + String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING); + Set sizes = parseSizes(sizesAttr); + IconEntry icon = this.icons.get(newLink); + /* There is already an icon with same URL for this document : + * they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */ + if(icon != null) { + icon.getRel().addAll(iconRels); + icon.getSizes().addAll(sizes); + } else { + icon = new IconEntry(newLink, iconRels, sizes); + this.icons.put(newLink, icon); + } } else if (rel.equalsIgnoreCase("canonical")) { tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next()); newLink.setAll(tag.opts); @@ -879,10 +961,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { } /** - * @return the {@link MultiProtocolURL} to the favicon that belongs to the document + * @return all icons links */ - public MultiProtocolURL getFavicon() { - return this.favicon; + public Map getIcons() { + return this.icons; } /* @@ -939,7 +1021,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final static Pattern commaSepPattern = Pattern.compile(" |,"); private final static Pattern semicSepPattern = Pattern.compile(" |;"); - + public Set getContentLanguages() { // i.e. 
// or @@ -1096,6 +1178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.iframes.clear(); this.embeds.clear(); this.images.clear(); + this.icons.clear(); this.metas.clear(); this.hreflang.clear(); this.navigation.clear(); diff --git a/source/net/yacy/document/parser/html/IconEntry.java b/source/net/yacy/document/parser/html/IconEntry.java new file mode 100644 index 000000000..55adce240 --- /dev/null +++ b/source/net/yacy/document/parser/html/IconEntry.java @@ -0,0 +1,198 @@ +/** + * IconEntry + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net +* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.parser.html; + +import java.awt.Dimension; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; + +import net.yacy.cora.document.id.DigestURL; + +/** + * Represents an icon in a document. + * + * @author luc + * + */ +public class IconEntry { + + /** Patern to parse a HTML link sizes token attribute (ie. 
"16x16") */ + public static final Pattern SIZE_PATTERN = Pattern.compile("([1-9][0-9]*)[xX]([1-9][0-9]*)"); + + /** Icon URL */ + private final DigestURL url; + /** + * Icon links relations (one url may be used as multiple icon relations in + * the same document) + */ + private final Set rel; + /** Icon sizes */ + private final Set sizes; + + /** + * Constructs instance from parameters. + * + * @param url + * must not be null. + * @param rel + * must not be null and contain at least one item. + * @param sizes + * optional. + */ + public IconEntry(final DigestURL url, Set rel, Set sizes) { + if (url == null) { + throw new IllegalArgumentException("url must not be null."); + } + if (rel == null || rel.isEmpty()) { + throw new IllegalArgumentException("rel must be specified"); + } + this.url = url; + this.rel = rel; + if (sizes != null) { + this.sizes = sizes; + } else { + this.sizes = new HashSet<>(); + } + } + + /** + * @return true when rel property contains a standard IANA registered icon + * link relation + */ + public boolean isStandardIcon() { + boolean standard = false; + for (String relation : this.rel) { + if (IconLinkRelations.isStandardIconRel(relation)) { + standard = true; + break; + } + } + return standard; + } + + /** + * @param size1 + * @param size2 + * @return distance between two sizes, or Double.MAX_VALUE when one size is null + */ + public static double getDistance(Dimension size1, Dimension size2) { + double result = Double.MAX_VALUE; + if(size1 != null && size2 != null) { + result = (Math.abs(size1.width - size2.width) + Math.abs(size1.height - size2.height)) / 2.0; + } + return result; + } + + /** + * @param preferredSize + * @return the size among sizes property which is the closest to + * preferredSize, or null when sizes is empty or preferredSize is null. 
+ */ + public Dimension getClosestSize(Dimension preferredSize) { + Dimension closest = null; + if (preferredSize != null) { + double closestDistance = Double.MAX_VALUE; + for (Dimension size : this.sizes) { + double currentDistance = getDistance(size, preferredSize); + if (closest == null) { + closest = size; + closestDistance = currentDistance; + } else { + if (currentDistance < closestDistance) { + closest = size; + closestDistance = currentDistance; + } + } + } + } + return closest; + } + + @Override + public String toString() { + StringBuilder res = new StringBuilder(); + res.append(""); + return res.toString(); + } + + /** + * @return icon URL + */ + public DigestURL getUrl() { + return url; + } + + /** + * @return icons link relations + */ + public Set getRel() { + return rel; + } + + /** + * @return icon eventual sizes + */ + public Set getSizes() { + return sizes; + } + + /** + * @return a string representation of sizes property, in the form of a valid + * HTML link tag sizes attribute (e.g. "16x16 64x64") + */ + public String sizesToString() { + StringBuilder builder = new StringBuilder(); + for (Dimension size : this.sizes) { + if (builder.length() > 0) { + builder.append(" "); + } + builder.append(size.width).append("x").append(size.height); + } + return builder.toString(); + } + + /** + * @return a string representation of rel property, int the form of a valid + * HTML link tag rel attribute (e.g. 
"icon apple-touch-icon") + */ + public String relToString() { + StringBuilder builder = new StringBuilder(); + for (String relation : this.rel) { + if (builder.length() > 0) { + builder.append(" "); + } + builder.append(relation); + } + return builder.toString(); + } + +} diff --git a/source/net/yacy/document/parser/html/IconLinkRelations.java b/source/net/yacy/document/parser/html/IconLinkRelations.java new file mode 100644 index 000000000..d939cfa6a --- /dev/null +++ b/source/net/yacy/document/parser/html/IconLinkRelations.java @@ -0,0 +1,92 @@ +/** + * IconLinkRelations + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $ + * $LastChangedRevision: 7567 $ + * $LastChangedBy: low012 $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.parser.html; + +/** + * Enumeration of HTML link relationships (rel attribute) designating icon. 
+ * @author luc + * + */ +public enum IconLinkRelations { + /** Standard IANA registered icon link relation (see https://www.iana.org/assignments/link-relations/link-relations.xhtml) */ + ICON("icon", "Standard favicon"), + /** Icon for IOS app shortcut */ + APPLE_TOUCH_ICON("apple-touch-icon", "IOS app shortcut icon"), + /** Icon for IOS app shortcut (deprecated but still used by major websites in 2015) */ + APPLE_TOUCH_ICON_PRECOMPOSED("apple-touch-icon-precomposed", "Deprecated IOS app shortcut icon"), + /** icon for Safari pinned tab */ + MASK_ICON("mask-icon", "Safari browser pinned tab icon"), + /** Icon for Fluid web app */ + FLUID_ICON("fluid-icon", "Fluid app icon"); + + /** HTML rel attribute value */ + private String relValue; + + /** Human readable description */ + private String description; + + private IconLinkRelations(String relValue, String description) { + this.relValue = relValue; + this.description = description; + } + + /** + * @return HTML rel attribute value + */ + public String getRelValue() { + return relValue; + } + + /** + * @return Human readable description of icon rel attribute + */ + public String getDescription() { + return description; + } + + /** + * @param relToken HTML rel attribute token + * @return true when relToken is an icon relationship (standard or non-standard) + */ + public static boolean isIconRel(String relToken) { + boolean res = false; + for(IconLinkRelations iconRel : IconLinkRelations.values()) { + if(iconRel.getRelValue().equalsIgnoreCase(relToken)) { + res = true; + break; + } + } + return res; + } + + /** + * @param relToken HTML rel attribute token + * @return true when relToken is Standard IANA registered icon link relation + */ + public static boolean isStandardIconRel(String relToken) { + return ICON.getRelValue().equalsIgnoreCase(relToken); + } + +} diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 4fd1dc24c..ccbebf442 100644 --- 
a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -166,7 +166,7 @@ public class htmlParser extends AbstractParser implements Parser { noDoubleImages, scraper.indexingDenied(), scraper.getDate()); - ppd.setFavicon(scraper.getFavicon()); + ppd.setIcons(scraper.getIcons()); return ppd; } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index efc5cdb4b..b97429d19 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -22,18 +22,24 @@ package net.yacy.kelondro.data.meta; +import java.awt.Dimension; import java.io.IOException; import java.net.MalformedURLException; import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Date; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Properties; +import java.util.Set; import java.util.regex.Pattern; +import org.apache.solr.common.SolrDocument; + import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.analysis.Classification; @@ -49,6 +55,8 @@ import net.yacy.crawler.retrieval.Response; import net.yacy.document.SentenceReader; import net.yacy.document.Tokenizer; import net.yacy.document.parser.pdfParser; +import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.IconEntry; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -64,8 +72,6 @@ import net.yacy.search.schema.CollectionSchema; import net.yacy.search.snippet.TextSnippet; import net.yacy.utils.crypt; -import org.apache.solr.common.SolrDocument; - /** * This is the URIMetadata object implementation for Solr documents. 
@@ -506,7 +512,79 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable getIcons() { + Collection icons = new ArrayList<>(); + List iconsUrlStubsList = getFieldValuesAsList(CollectionSchema.icons_urlstub_sxt.getSolrFieldName()); + if (iconsUrlStubsList != null) { + + List ports = CollectionConfiguration.indexedList2protocolList( + getFieldValues(CollectionSchema.icons_protocol_sxt.getSolrFieldName()), iconsUrlStubsList.size()); + List allSizes = getFieldValuesAsList(CollectionSchema.icons_sizes_sxt.getSolrFieldName()); + List allRels = getFieldValuesAsList(CollectionSchema.icons_rel_sxt.getSolrFieldName()); + + Object item; + for (int index = 0; index < iconsUrlStubsList.size(); index++) { + item = iconsUrlStubsList.get(index); + String urlStub = null; + if (item instanceof String) { + urlStub = (String) item; + String iconURLStr = (ports != null && ports.size() > index ? ports.get(index) : "http") + "://" + urlStub; + + DigestURL iconURL; + try { + iconURL = new DigestURL(iconURLStr); + } catch (MalformedURLException e) { + continue; + } + + Set rels = null; + if (allRels.size() > index) { + item = allRels.get(index); + if (item instanceof String) { + rels = ContentScraper.parseSpaceSeparatedTokens((String) item); + } + } + /* This may happen when icons_rel_sxt field has been disabled in solr schema */ + if(rels == null) { + rels = new HashSet<>(); + rels.add("unknown"); + } + + Set sizes = null; + if (allSizes.size() > index) { + item = allSizes.get(index); + if (item instanceof String) { + sizes = ContentScraper.parseSizes((String) item); + } + } + + icons.add(new IconEntry(iconURL, rels, sizes)); + } + } + } + return icons; + } + + /** + * @param name field name + * @return field values from field name eventually immutable empty list when field has no values or is not a List + */ + public List getFieldValuesAsList(String name) { + Collection fieldValues = getFieldValues(name); + List list; + if (fieldValues instanceof List) { + list = 
(List) fieldValues; + } else { + list = Collections.EMPTY_LIST; + } + return list; + } + public static Date getDate(SolrDocument doc, final CollectionSchema key) { Date x = doc == null ? null : (Date) doc.getFieldValue(key.getSolrFieldName()); Date now = new Date(); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index bd2fd4716..f25288066 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -48,6 +48,13 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; +import org.eclipse.jetty.util.ConcurrentHashSet; + import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; @@ -88,6 +95,7 @@ import net.yacy.document.SentenceReader; import net.yacy.document.Tokenizer; import net.yacy.document.content.DCEntry; import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.IconEntry; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -100,13 +108,6 @@ import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrInputDocument; -import 
org.apache.solr.common.SolrInputField; -import org.eclipse.jetty.util.ConcurrentHashSet; - public class CollectionConfiguration extends SchemaConfiguration implements Serializable { @@ -543,6 +544,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final Object parser = document.getParserObject(); boolean containsCanonical = false; DigestURL canonical = null; + + processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values()); + if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; List images = html.getImages(); @@ -666,45 +670,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles); // images - final ArrayList imgprots = new ArrayList(images.size()); - final Integer[] imgheights = new Integer[images.size()]; - final Integer[] imgwidths = new Integer[images.size()]; - final Integer[] imgpixels = new Integer[images.size()]; - final String[] imgstubs = new String[images.size()]; - final String[] imgalts = new String[images.size()]; - int withalt = 0; - int i = 0; - LinkedHashSet images_text_map = new LinkedHashSet(); - for (final ImageEntry ie: images) { - final MultiProtocolURL uri = ie.url(); - inboundLinks.remove(uri); - outboundLinks.remove(uri); - imgheights[i] = ie.height(); - imgwidths[i] = ie.width(); - imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? 
-1 : ie.height() * ie.width(); - String protocol = uri.getProtocol(); - imgprots.add(protocol); - imgstubs[i] = uri.toString().substring(protocol.length() + 3); - imgalts[i] = ie.alt(); - for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it); - if (ie.alt() != null && ie.alt().length() > 0) { - SentenceReader sr = new SentenceReader(ie.alt()); - while (sr.hasNext()) images_text_map.add(sr.next().toString()); - withalt++; - } - i++; - } - StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1); - for (String s: images_text_map) images_text.append(s.trim()).append(' '); - if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size()); - if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); - if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); - if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts); - if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights); - if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths); - if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels); - if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt); - if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim()); + processImages(doc, allAttr, inboundLinks, outboundLinks, images); // style sheets if (allAttr || contains(CollectionSchema.css_tag_sxt)) { @@ -1031,6 +997,116 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return doc; } + /** + * 
Add icons metadata to Solr doc when corresponding schema attributes are + * enabled. Remove icon urls from inboundLinks and outboundLinks. + * + * @param doc + * solr document to fill + * @param allAttr + * all attributes are enabled + * @param inboundLinks + * all document inbound links + * @param outboundLinks + * all document outbound links + * @param icons + * document icon entries + */ + private void processIcons(SolrVector doc, boolean allAttr, LinkedHashMap inboundLinks, + LinkedHashMap outboundLinks, Collection icons) { + final List protocols = new ArrayList(icons.size()); + final String[] sizes = new String[icons.size()]; + final String[] stubs = new String[icons.size()]; + final String[] rels = new String[icons.size()]; + int i = 0; + /* Prepare solr field values */ + for (final IconEntry ie : icons) { + final DigestURL url = ie.getUrl(); + + inboundLinks.remove(url); + outboundLinks.remove(url); + + String protocol = url.getProtocol(); + protocols.add(protocol); + + /* + * There may be multiple sizes and multiple rels for one icon : we + * store this as flat string as currently solr doesn't support + * multidimensional array fields + */ + sizes[i] = ie.sizesToString(); + stubs[i] = url.toString().substring(protocol.length() + 3); + rels[i] = ie.relToString(); + + i++; + } + if (allAttr || contains(CollectionSchema.icons_protocol_sxt)) { + add(doc, CollectionSchema.icons_protocol_sxt, protocolList2indexedList(protocols)); + } + if (allAttr || contains(CollectionSchema.icons_urlstub_sxt)) { + add(doc, CollectionSchema.icons_urlstub_sxt, stubs); + } + if (allAttr || contains(CollectionSchema.icons_rel_sxt)) { + add(doc, CollectionSchema.icons_rel_sxt, rels); + } + if (allAttr || contains(CollectionSchema.icons_sizes_sxt)) { + add(doc, CollectionSchema.icons_sizes_sxt, sizes); + } + } + + /** + * Add images metadata to Solr doc when corresponding schema attributes are enabled. + * Remove images urls from inboundLinks and outboundLinks.
+ * @param doc solr document to fill + * @param allAttr all attributes are enabled + * @param inboundLinks all document inbound links + * @param outboundLinks all document outbound links + * @param images document images + */ + private void processImages(SolrVector doc, boolean allAttr, LinkedHashMap inboundLinks, + LinkedHashMap outboundLinks, List images) { + final ArrayList imgprots = new ArrayList(images.size()); + final Integer[] imgheights = new Integer[images.size()]; + final Integer[] imgwidths = new Integer[images.size()]; + final Integer[] imgpixels = new Integer[images.size()]; + final String[] imgstubs = new String[images.size()]; + final String[] imgalts = new String[images.size()]; + int withalt = 0; + int i = 0; + LinkedHashSet images_text_map = new LinkedHashSet(); + /* Prepare flat solr field values */ + for (final ImageEntry ie: images) { + final MultiProtocolURL uri = ie.url(); + inboundLinks.remove(uri); + outboundLinks.remove(uri); + imgheights[i] = ie.height(); + imgwidths[i] = ie.width(); + imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? 
-1 : ie.height() * ie.width(); + String protocol = uri.getProtocol(); + imgprots.add(protocol); + imgstubs[i] = uri.toString().substring(protocol.length() + 3); + imgalts[i] = ie.alt(); + for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it); + if (ie.alt() != null && ie.alt().length() > 0) { + SentenceReader sr = new SentenceReader(ie.alt()); + while (sr.hasNext()) images_text_map.add(sr.next().toString()); + withalt++; + } + i++; + } + StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1); + for (String s: images_text_map) images_text.append(s.trim()).append(' '); + if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size()); + if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); + if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); + if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts); + if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights); + if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths); + if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels); + if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt); + if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim()); + } + /** * attach additional information to the document to enable navigation features * @param doc the document to be enriched @@ -1937,14 +2013,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return a; } + /** + * Uncompress 
indexed iplist of protocol names to a list of specified dimension. + * @param iplist indexed list typically produced by protocolList2indexedList + * @param dimension size of target list + * @return a list of protocol names + */ public static List indexedList2protocolList(Collection iplist, int dimension) { List a = new ArrayList(dimension); for (int i = 0; i < dimension; i++) a.add("http"); if (iplist == null) return a; for (Object ip : iplist) { // ip format is 001-https but can be 4 digits 1011-https - int i = ((String) ip).indexOf('-'); - a.set(Integer.parseInt(((String) ip).substring(0, i)), ((String) ip).substring(i+1)); + String indexedProtocol = ((String) ip); + int i = indexedProtocol.indexOf('-'); + /* Silently ignore badly formatted entry */ + if(i > 0 && indexedProtocol.length() > (i + 1)) { + a.set(Integer.parseInt(indexedProtocol.substring(0, i)), indexedProtocol.substring(i+1)); + } } return a; } diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index d97c7f00f..1b5e84b3b 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -139,6 +139,12 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol"), outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"), + icons_urlstub_sxt(SolrType.string, true, true, true, false, true, "all icon links without the protocol and '://'"), + /** All icon links protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored */ + icons_protocol_sxt(SolrType.string, true, true, true, false, false, "all icon links protocols"), + icons_rel_sxt(SolrType.string, true, true, true, false, false, "all icon links relationships space 
separated (e.g.. 'icon apple-touch-icon')"), + icons_sizes_sxt(SolrType.num_integer, true, true, true, false, false, "all icon sizes space separated (e.g. '16x16 32x32')"), + images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"), diff --git a/test/java/net/yacy/document/parser/html/ContentScraperTest.java b/test/java/net/yacy/document/parser/html/ContentScraperTest.java new file mode 100644 index 000000000..8fad1d6d3 --- /dev/null +++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java @@ -0,0 +1,125 @@ +/** + * ContentScraperTest + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net +* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.parser.html; + +import java.awt.Dimension; +import java.util.Set; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for ContentScrapper class. 
+ * @author luc + * + */ +public class ContentScraperTest { + + @Test + public final void testParseSizes() { + /* Normal case */ + Set sizes = ContentScraper.parseSizes("96x128"); + Assert.assertEquals(1, sizes.size()); + Assert.assertTrue(sizes.contains(new Dimension(96, 128))); + + /* "any" keyword */ + sizes = ContentScraper.parseSizes("any"); + Assert.assertEquals(0, sizes.size()); + + /* Multiple valid sizes, lower and upper case separator */ + sizes = ContentScraper.parseSizes("96x128 16X16 1X2 1024x768"); + Assert.assertEquals(4, sizes.size()); + Assert.assertTrue(sizes.contains(new Dimension(96, 128))); + Assert.assertTrue(sizes.contains(new Dimension(16, 16))); + Assert.assertTrue(sizes.contains(new Dimension(1, 2))); + Assert.assertTrue(sizes.contains(new Dimension(1024, 768))); + + /* Duplicate entries */ + sizes = ContentScraper.parseSizes("96x128 96X128 1X2 96x128"); + Assert.assertEquals(2, sizes.size()); + Assert.assertTrue(sizes.contains(new Dimension(96, 128))); + Assert.assertTrue(sizes.contains(new Dimension(1, 2))); + + /* Multiple inner and trailing spaces */ + sizes = ContentScraper.parseSizes(" 96x128 16X16 "); + Assert.assertEquals(2, sizes.size()); + Assert.assertTrue(sizes.contains(new Dimension(96, 128))); + Assert.assertTrue(sizes.contains(new Dimension(16, 16))); + + /* Empty string */ + sizes = ContentScraper.parseSizes(""); + Assert.assertEquals(0, sizes.size()); + + /* null string */ + sizes = ContentScraper.parseSizes(null); + Assert.assertEquals(0, sizes.size()); + + /* Invalid sizes */ + sizes = ContentScraper.parseSizes("096x0128 -16x-16 0x0 x768 78x axb 1242"); + Assert.assertEquals(0, sizes.size()); + + /* Mix of valid and invalid sizes */ + sizes = ContentScraper.parseSizes("96x128 16X16 axb 123 78x32"); + Assert.assertEquals(3, sizes.size()); + Assert.assertTrue(sizes.contains(new Dimension(96, 128))); + Assert.assertTrue(sizes.contains(new Dimension(16, 16))); + Assert.assertTrue(sizes.contains(new Dimension(78, 32))); + } +
+ @Test + public final void testParseSpaceSeparatedTokens() { + /* Normal case */ + Set tokens = ContentScraper.parseSpaceSeparatedTokens("abc de"); + Assert.assertEquals(2, tokens.size()); + Assert.assertTrue(tokens.contains("abc")); + Assert.assertTrue(tokens.contains("de")); + + /* One item only */ + tokens = ContentScraper.parseSpaceSeparatedTokens("abc"); + Assert.assertEquals(1, tokens.size()); + Assert.assertTrue(tokens.contains("abc")); + + /* Multiple inner and trailing spaces */ + tokens = ContentScraper.parseSpaceSeparatedTokens(" abc d efff fgj "); + Assert.assertEquals(4, tokens.size()); + Assert.assertTrue(tokens.contains("abc")); + Assert.assertTrue(tokens.contains("d")); + Assert.assertTrue(tokens.contains("efff")); + Assert.assertTrue(tokens.contains("fgj")); + + /* Duplicate entries */ + tokens = ContentScraper.parseSpaceSeparatedTokens("abc bb abc abc ABC"); + Assert.assertEquals(3, tokens.size()); + Assert.assertTrue(tokens.contains("abc")); + /* ignoring case is not the purpose of this function */ + Assert.assertTrue(tokens.contains("ABC")); + Assert.assertTrue(tokens.contains("bb")); + + /* Empty string */ + tokens = ContentScraper.parseSpaceSeparatedTokens(""); + Assert.assertEquals(0, tokens.size()); + + /* Null string */ + tokens = ContentScraper.parseSpaceSeparatedTokens(null); + Assert.assertEquals(0, tokens.size()); + } + +} diff --git a/test/java/net/yacy/document/parser/html/IconEntryTest.java b/test/java/net/yacy/document/parser/html/IconEntryTest.java new file mode 100644 index 000000000..313e6d377 --- /dev/null +++ b/test/java/net/yacy/document/parser/html/IconEntryTest.java @@ -0,0 +1,192 @@ +/** + * IconEntryTest + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net +* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License,
or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.parser.html; + +import java.awt.Dimension; +import java.net.MalformedURLException; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Assert; +import org.junit.Test; + +import net.yacy.cora.document.id.DigestURL; + +/** + * Unit tests for IconEntry class. + * @author luc + * + */ +public class IconEntryTest { + + @Test + public final void testGetDistance() { + /* Normal case : one size has both width and height greater */ + Dimension size1 = new Dimension(5, 8); + Dimension size2 = new Dimension(7, 12); + Assert.assertEquals(3.0, IconEntry.getDistance(size1, size2), 0.0); + /* Check inverted parameters should produces same result */ + Assert.assertEquals(3.0, IconEntry.getDistance(size2, size1), 0.0); + /* Equal sizes */ + size2 = new Dimension(5, 8); + Assert.assertEquals(0.0, IconEntry.getDistance(size1, size2), 0.0); + /* Equal sizes */ + size2 = new Dimension(5, 8); + Assert.assertEquals(0.0, IconEntry.getDistance(size1, size2), 0.0); + /* Only one dimension differs */ + size2 = new Dimension(5, 12); + Assert.assertEquals(2.0, IconEntry.getDistance(size1, size2), 0.0); + size2 = new Dimension(10, 8); + Assert.assertEquals(2.5, IconEntry.getDistance(size1, size2), 0.0); + /* width lower, height upper */ + size2 = new Dimension(3, 12); + Assert.assertEquals(3.0, IconEntry.getDistance(size1, size2), 0.0); + /* negative values */ + size1 = new Dimension(-5, -8); + size2 = new Dimension(-7, -12); + Assert.assertEquals(3.0, IconEntry.getDistance(size1, size2), 0.0); + /* one 
null */ + size1 = null; + size2 = new Dimension(-7, -12); + Assert.assertEquals(Double.MAX_VALUE, IconEntry.getDistance(size1, size2), 0.0); + } + + @Test + public final void testGetClosestSize() throws MalformedURLException { + /* Preferred size in sizes set */ + Set rels = new HashSet<>(); + rels.add(IconLinkRelations.ICON.getRelValue()); + + Set sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + sizes.add(new Dimension(256,512)); + sizes.add(new Dimension(16,16)); + + Dimension preferredSize = new Dimension(16, 16); + IconEntry icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + Dimension result = icon.getClosestSize(preferredSize); + Assert.assertEquals(preferredSize, result); + + /* Preferred size lower than all sizes in set */ + preferredSize = new Dimension(12, 12); + result = icon.getClosestSize(preferredSize); + Assert.assertEquals(new Dimension(16,16), result); + + /* Preferred size over than all sizes in set */ + preferredSize = new Dimension(1992, 1024); + result = icon.getClosestSize(preferredSize); + Assert.assertEquals(new Dimension(256, 512), result); + + /* Preferred size between sizes in set */ + preferredSize = new Dimension(17, 18); + result = icon.getClosestSize(preferredSize); + Assert.assertEquals(new Dimension(16, 16), result); + + /* Sizes set contains only one item */ + sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + preferredSize = new Dimension(1992, 1024); + result = icon.getClosestSize(preferredSize); + Assert.assertEquals(new Dimension(128, 128), result); + + /* Empty sizes set */ + sizes = new HashSet<>(); + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + preferredSize = new Dimension(16, 16); + result = icon.getClosestSize(preferredSize); + Assert.assertNull(result); + + /* Null preferred size */ + sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + sizes.add(new 
Dimension(256,512)); + sizes.add(new Dimension(16,16)); + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + preferredSize = null; + result = icon.getClosestSize(preferredSize); + Assert.assertNull(result); + } + + @Test + public final void testSizesToString() throws MalformedURLException { + /* Multiple values in sizes set */ + Set rels = new HashSet<>(); + rels.add(IconLinkRelations.ICON.getRelValue()); + + Set sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + sizes.add(new Dimension(256,512)); + sizes.add(new Dimension(16,16)); + + IconEntry icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + String sizesStr = icon.sizesToString(); + /* The set is not ordered, only check result contains what we expect */ + Assert.assertTrue(sizesStr.contains("128x128")); + Assert.assertTrue(sizesStr.contains("256x512")); + Assert.assertTrue(sizesStr.contains("16x16")); + Assert.assertTrue(sizesStr.contains(" ")); + + /* One value in sizes set */ + sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + sizesStr = icon.sizesToString(); + Assert.assertEquals("128x128", sizesStr); + + /* Empty sizes set */ + sizes = new HashSet<>(); + + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + sizesStr = icon.sizesToString(); + Assert.assertTrue(sizesStr.isEmpty()); + } + + @Test + public final void testRelToString() throws MalformedURLException { + /* Multiple values in rel set */ + Set rels = new HashSet<>(); + rels.add(IconLinkRelations.ICON.getRelValue()); + rels.add(IconLinkRelations.APPLE_TOUCH_ICON.getRelValue()); + rels.add(IconLinkRelations.MASK_ICON.getRelValue()); + + Set sizes = new HashSet<>(); + sizes.add(new Dimension(128,128)); + + IconEntry icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + String relStr = icon.relToString(); + /* The set is not ordered, only check result contains what we expect */ + 
Assert.assertTrue(relStr.contains(IconLinkRelations.ICON.getRelValue())); + Assert.assertTrue(relStr.contains(IconLinkRelations.APPLE_TOUCH_ICON.getRelValue())); + Assert.assertTrue(relStr.contains(IconLinkRelations.MASK_ICON.getRelValue())); + Assert.assertTrue(relStr.contains(" ")); + + /* One value in rel set */ + rels = new HashSet<>(); + rels.add(IconLinkRelations.ICON.getRelValue()); + + icon = new IconEntry(new DigestURL("http://yacy.net"), rels, sizes); + relStr = icon.relToString(); + Assert.assertEquals(IconLinkRelations.ICON.getRelValue(), relStr); + } + +} diff --git a/test/java/net/yacy/kelondro/data/meta/URIMetadataNodeTest.java b/test/java/net/yacy/kelondro/data/meta/URIMetadataNodeTest.java new file mode 100644 index 000000000..cf56030cd --- /dev/null +++ b/test/java/net/yacy/kelondro/data/meta/URIMetadataNodeTest.java @@ -0,0 +1,157 @@ +/** + * URIMetadataNodeTest + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net +* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.kelondro.data.meta; + +import java.awt.Dimension; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.document.parser.html.IconEntry; +import net.yacy.search.schema.CollectionConfiguration; +import net.yacy.search.schema.CollectionSchema; + +/** + * Unit tests for URIMetadataNode class. + * + * @author luc + * + */ +public class URIMetadataNodeTest { + + /** + * Three standard icons with different sizes, one non-standard with a larger + * size + */ + @Test + public final void testGetIcons4Items() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png", + "somehost.org/static/images/icon64.png", + "somehost.org/static/images/iconApple128.png" }); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http", "https", "https", "http" })); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), + new String[] { "icon", "icon", "icon", "apple-touch-icon" }); + metadataNode.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), + new String[] { "16x24", "32x32", "58x64", "128x128" }); + + Collection icons = metadataNode.getIcons(); + int nb = 0; + /* Check results consistency */ + for(IconEntry icon : icons) { + if("http://somehost.org/static/images/icon16.png".equals(icon.getUrl().toNormalform(false))) { + Assert.assertEquals(1, icon.getSizes().size()); + Dimension size = icon.getSizes().iterator().next(); + Assert.assertEquals(16, size.width); +
Assert.assertEquals(24, size.height); + Assert.assertEquals(1, icon.getRel().size()); + Assert.assertEquals("icon", icon.getRel().iterator().next()); + nb++; + } else if("https://somehost.org/static/images/icon32.png".equals(icon.getUrl().toNormalform(false))) { + Assert.assertEquals(1, icon.getSizes().size()); + Dimension size = icon.getSizes().iterator().next(); + Assert.assertEquals(32, size.width); + Assert.assertEquals(32, size.height); + Assert.assertEquals(1, icon.getRel().size()); + Assert.assertEquals("icon", icon.getRel().iterator().next()); + nb++; + } else if("https://somehost.org/static/images/icon64.png".equals(icon.getUrl().toNormalform(false))) { + Assert.assertEquals(1, icon.getSizes().size()); + Dimension size = icon.getSizes().iterator().next(); + Assert.assertEquals(58, size.width); + Assert.assertEquals(64, size.height); + Assert.assertEquals(1, icon.getRel().size()); + Assert.assertEquals("icon", icon.getRel().iterator().next()); + nb++; + } else if("http://somehost.org/static/images/iconApple128.png".equals(icon.getUrl().toNormalform(false))) { + Assert.assertEquals(1, icon.getSizes().size()); + Dimension size = icon.getSizes().iterator().next(); + Assert.assertEquals(128, size.width); + Assert.assertEquals(128, size.height); + Assert.assertEquals(1, icon.getRel().size()); + Assert.assertEquals("apple-touch-icon", icon.getRel().iterator().next()); + nb++; + } + } + Assert.assertEquals(4, nb); + } + + /** + * Only icons_urlstub_sxt field valued + */ + @Test + public final void testGetIconsOnlyIconsUrlstubSxt() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png", + "somehost.org/static/images/icon64.png", + "somehost.org/static/images/iconApple124.png" }); + + Collection icons = 
metadataNode.getIcons(); + Assert.assertEquals(4, icons.size()); + + } + + /** + * Only one standard icon + */ + @Test + public final void testGetIcons1Item() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode.setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/icon16.png" }); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http" })); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), new String[] { "icon" }); + metadataNode.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), new String[] { "16x16" }); + + Collection icons = metadataNode.getIcons(); + Assert.assertEquals(1, icons.size()); + IconEntry icon = icons.iterator().next(); + Assert.assertEquals(1, icon.getSizes().size()); + Dimension size = icon.getSizes().iterator().next(); + Assert.assertEquals(16.0, size.getWidth(), 0.0); + Assert.assertEquals(16.0, size.getHeight(), 0.0); + } + + /** + * No Icon + */ + @Test + public final void testGetIconsNoIcon() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + + Collection icons = metadataNode.getIcons(); + Assert.assertEquals(0, icons.size()); + } + +} diff --git a/test/java/yacysearchitemTest.java b/test/java/yacysearchitemTest.java new file mode 100644 index 000000000..125b52047 --- /dev/null +++ b/test/java/yacysearchitemTest.java @@ -0,0 +1,187 @@ + +/** + * URIMetadataNodeTest + * Copyright 2011 by Michael Peter Christen + * First released 28.04.2011 at http://yacy.net +* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * 
version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +import java.awt.Dimension; +import java.net.MalformedURLException; +import java.util.Arrays; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.schema.CollectionConfiguration; +import net.yacy.search.schema.CollectionSchema; + +/** + * Unit tests for yacysearchitem class. + * + * @author luc + * + */ +public class yacysearchitemTest { + + /** + * Three standard icons with diferrent sizes, one non-standard with a larger + * size + * + * @throws MalformedURLException + */ + @Test + public final void testGetFaviconURL() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "someHost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png", + "somehost.org/static/images/icon64.png", + "somehost.org/static/images/iconApple124.png" }); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http", "http", "http", "http" })); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), + new String[] { "icon", "icon", "icon", "apple-touch-icon" }); + 
metadataNode.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), + new String[] { "16x16", "32x32", "64x64", "128x128" }); + + /* Search for a size present in icons collection */ + DigestURL faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(32, 32)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/icon32.png", faviconURL.toNormalform(false)); + + /* Search for a size not in icons collection */ + faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(40, 40)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/icon32.png", faviconURL.toNormalform(false)); + + /* + * Search for a size equals to non-standard : standard icon is stil + * preffered + */ + faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(128, 128)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/icon64.png", faviconURL.toNormalform(false)); + } + + /** + * Only non-standard icons + * + * @throws MalformedURLException + */ + @Test + public final void testGetFaviconURLNonStandard() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/mask32.png", + "somehost.org/static/images/fluid.64.png", + "somehost.org/static/images/iconApple124.png" }); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http", "http", "http" })); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), + new String[] { "mask-icon", "fluid-icon", "apple-touch-icon" }); + 
metadataNode.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), + new String[] { "32x32", "64x64", "128x128" }); + + /* Non standard icon is returned as fallback */ + DigestURL faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(32, 32)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/mask32.png", faviconURL.toNormalform(false)); + } + + /** + * One standard icon with multiple sizes + * + * @throws MalformedURLException + */ + @Test + public final void testGetFaviconURLMultiSizes() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/favicon.ico"}); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http"})); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), + new String[] { "icon"}); + metadataNode.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), + new String[] { "16x16 32x32 64x64",}); + + /* Search for a size in sizes set */ + DigestURL faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(32, 32)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/favicon.ico", faviconURL.toNormalform(false)); + + /* Search for a size not in sizes set */ + faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(40, 40)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/favicon.ico", faviconURL.toNormalform(false)); + } + + /** + * One standard icon with no size + * + * @throws MalformedURLException + */ + 
@Test + public final void testGetFaviconURLNoSize() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://somehost.org")); + metadataNode + .setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), + new String[] { "somehost.org/static/images/favicon.ico"}); + List protocols = CollectionConfiguration + .protocolList2indexedList(Arrays.asList(new String[] { "http"})); + metadataNode.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), protocols); + metadataNode.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), + new String[] { "icon"}); + + DigestURL faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(32, 32)); + Assert.assertNotNull(faviconURL); + Assert.assertEquals("http://somehost.org/static/images/favicon.ico", faviconURL.toNormalform(false)); + } + + + /** + * No icon in document + * + * @throws MalformedURLException + */ + @Test + public final void testGetFaviconURLNoIcon() throws MalformedURLException { + URIMetadataNode metadataNode = new URIMetadataNode(new DigestURL("http://someHost.org")); + + /* Default fallback favicon URL should be generated */ + DigestURL faviconURL = yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, metadataNode, + new Dimension(32, 32)); + Assert.assertEquals("http://somehost.org/favicon.ico", faviconURL.toNormalform(false)); + } + +}