Improved HTML icons indexing and rendering in search results.

See http://mantis.tokeek.de/view.php?id=629
pull/39/head
luc 9 years ago
parent edef6cd0dc
commit 3cc5619d93

@ -278,6 +278,18 @@ outboundlinks_urlstub_sxt
## external links, the visible anchor text
outboundlinks_anchortext_txt
## all icon links without the protocol and '://'
icons_urlstub_sxt
## all icon links protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored
icons_protocol_sxt
## all icon links relationships space separated (e.g. 'icon apple-touch-icon')
icons_rel_sxt
## all icon sizes space separated (e.g. '16x16 32x32')
icons_sizes_sxt
## all text/words appearing in image alt texts or the tokenized url
images_text_t

@ -35,6 +35,11 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -47,11 +52,6 @@ import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
public class getpageinfo {
@ -110,8 +110,13 @@ public class getpageinfo {
// put the document title
prop.putXML("title", removelinebreaks(scraper.dc_title()));
DigestURL favicon = null;
if (scraper.getIcons() != null && !scraper.getIcons().isEmpty()) {
favicon = scraper.getIcons().keySet().iterator().next();
}
// put the favicon that belongs to the document
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
prop.put("favicon", (favicon == null) ? "" : favicon.toString());
// put keywords
final Set<String> list = scraper.dc_subject();

@ -35,6 +35,11 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -47,11 +52,6 @@ import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
public class getpageinfo_p {
@ -110,8 +110,13 @@ public class getpageinfo_p {
// put the document title
prop.putXML("title", scraper.dc_title());
DigestURL favicon = null;
if (scraper.getIcons() != null && !scraper.getIcons().isEmpty()) {
favicon = scraper.getIcons().keySet().iterator().next();
}
// put the favicon that belongs to the document
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
prop.put("favicon", (favicon == null) ? "" : favicon.toString());
// put keywords
final Set<String> list = scraper.dc_subject();

@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.awt.Dimension;
import java.io.File;
import java.net.MalformedURLException;
import java.util.Collection;
@ -48,6 +49,7 @@ import net.yacy.crawler.data.Transactions;
import net.yacy.crawler.data.Transactions.State;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.URLLicense;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.Formatter;
import net.yacy.peers.NewsPool;
@ -127,14 +129,6 @@ public class yacysearchitem {
final DigestURL resultURL = result.url();
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final int port = resultURL.getPort();
DigestURL faviconURL = null;
if ((fileType == FileType.HTML || fileType == FileType.JSON) && !sb.isIntranetMode()) try {
faviconURL = new DigestURL(resultURL.getProtocol() + "://" + resultURL.getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico");
} catch (final MalformedURLException e1) {
ConcurrentLog.logException(e1);
faviconURL = null;
}
final String resource = theSearch.query.domType.toString();
final String origQ = theSearch.query.getQueryGoal().getQueryString(true);
prop.put("content", 1); // switch on specific content
@ -194,6 +188,7 @@ public class yacysearchitem {
boolean isAtomFeed = header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("atom");
String resultFileName = resultURL.getFileName();
prop.putHTML("content_target", target);
DigestURL faviconURL = getFaviconURL(sb.isIntranetMode(), fileType, result, new Dimension(16, 16));
prop.putHTML("content_faviconUrl", processFaviconURL(authenticated, faviconURL));
prop.put("content_urlhash", urlhash);
prop.put("content_ranking", Float.toString(result.score()));
@ -342,6 +337,68 @@ public class yacysearchitem {
return prop;
}
/**
 * Selects the favicon URL to display for a search result.
 * <p>
 * Icons stored in the solr result document are examined first. When none of
 * them is selected, the default favicon URL (i.e. "http://host/favicon.ico")
 * is generated from the protocol, host and port of the result URL.
 * <p>
 * Selection rule : a standard icon is always preferred over a non standard
 * one. Among icons of the same kind, the one whose closest size is strictly
 * nearer to preferredSize wins. An icon with no known size gets a distance of
 * Double.MAX_VALUE, so a non standard icon without sizes is never selected.
 *
 * @param isIntranetMode
 *            when true returns null
 * @param fileType
 *            file type result as specified in request header. A URL is only
 *            produced for HTML and JSON, otherwise null is returned.
 * @param result
 *            solr document result. Must not be null.
 * @param preferredSize
 *            preferred icon size. If no icon matches exactly, the closest one
 *            is returned.
 * @return favicon URL or null when even default favicon URL can not be
 *         generated
 * @throws NullPointerException
 *             when result is null and a favicon lookup is actually performed
 */
protected static DigestURL getFaviconURL(final boolean isIntranetMode, final RequestHeader.FileType fileType,
        final URIMetadataNode result, Dimension preferredSize) {
    final boolean supportedType = fileType == FileType.HTML || fileType == FileType.JSON;
    if (!supportedType || isIntranetMode) {
        return null;
    }
    try {
        /* The fallback location is computed up front, before icons are examined */
        final String portPart = (result.url().getPort() != -1) ? (":" + result.url().getPort()) : "";
        final String fallbackLocation = result.url().getProtocol() + "://" + result.url().getHost() + portPart
                + "/favicon.ico";

        IconEntry selected = null;
        boolean standardSelected = false;
        double bestDistance = Double.MAX_VALUE;
        for (final IconEntry candidate : result.getIcons()) {
            final boolean standard = candidate.isStandardIcon();
            final double distance = IconEntry.getDistance(candidate.getClosestSize(preferredSize), preferredSize);
            final boolean closer = distance < bestDistance;
            /*
             * Once a standard icon is held, only a closer standard icon may
             * replace it. Before that, any standard icon or any closer icon
             * is accepted.
             */
            final boolean accepted = standardSelected ? (standard && closer) : (standard || closer);
            if (!accepted) {
                continue;
            }
            selected = candidate;
            bestDistance = distance;
            standardSelected = standard;
            if (standard && distance == 0.0) {
                /* Standard icon with exactly the preferred size : no better candidate is possible */
                break;
            }
        }

        if (selected != null) {
            return selected.getUrl();
        }
        return new DigestURL(fallbackLocation);
    } catch (final MalformedURLException e1) {
        ConcurrentLog.logException(e1);
        return null;
    }
}
/**
* @param authenticated
* true when current user is authenticated

@ -32,7 +32,6 @@ import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
@ -62,6 +61,7 @@ import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.util.FileUtils;
@ -87,7 +87,8 @@ public class Document {
private LinkedHashMap<AnchorURL, String> audiolinks, videolinks, applinks, hyperlinks; // TODO: check if redundant value (set to key.getNameProperty()) is needed
private LinkedHashMap<DigestURL, String> inboundlinks, outboundlinks;
private Set<AnchorURL> emaillinks; // mailto: links
private MultiProtocolURL favicon;
/** links to icons that belongs to the document (mapped by absolute URL) */
private Map<DigestURL, IconEntry> icons;
private boolean resorted;
private final Set<String> languages;
private boolean indexingDenied;
@ -139,6 +140,7 @@ public class Document {
this.videolinks = null;
this.applinks = null;
this.emaillinks = null;
this.icons = new HashMap<>();
this.resorted = false;
this.inboundlinks = null;
this.outboundlinks = null;
@ -576,6 +578,7 @@ dc_rights
// that can be calculated from given hyperlinks and imagelinks
this.hyperlinks.putAll(allReflinks(this.images.values()));
this.hyperlinks.putAll(allReflinks(this.icons.keySet()));
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
this.hyperlinks.putAll(allReflinks(this.videolinks.keySet()));
this.hyperlinks.putAll(allReflinks(this.applinks.keySet()));
@ -658,6 +661,8 @@ dc_rights
url = new AnchorURL((String) o);
else if (o instanceof ImageEntry)
url = new AnchorURL(((ImageEntry) o).url());
else if (o instanceof IconEntry)
url = new AnchorURL(((IconEntry) o).getUrl());
else {
assert false;
continue loop;
@ -720,18 +725,24 @@ dc_rights
}
/**
* @return the {@link URL} to the favicon that belongs to the document
* @return links to icons that belongs to the document (mapped by absolute URL)
*/
public MultiProtocolURL getFavicon() {
return this.favicon;
}
public Map<DigestURL, IconEntry> getIcons() {
    /* The internal map itself is handed out, not a copy */
    return this.icons;
}
/**
* @param faviconURL the {@link URL} to the favicon that belongs to the document
* Set links to icons that belongs to the document (mapped by absolute URL)
* @param icons
*/
public void setFavicon(final MultiProtocolURL faviconURL) {
this.favicon = faviconURL;
}
public void setIcons(Map<DigestURL, IconEntry> icons) {
    /* A null argument is replaced by a new empty map, so that the icons property is never null */
    this.icons = (icons == null) ? new HashMap<DigestURL, IconEntry>() : icons;
}
public int inboundLinkNofollowCount() {
if (this.inboundlinks == null) resortLinks();
@ -836,9 +847,13 @@ dc_rights
}
/**
* merge documents: a helper method for all parsers that return multiple documents
* @param docs
* @return
* merge documents: a helper method for all parsers that return multiple documents.
* Note : when docs contains more than one item, eventual icons in each docs are not merged in result doc,
* as their scope is limited to only one document.
* @param location url of merged document
* @param globalMime Mime type of merged document
* @param docs documents to merge
* @return document resulting of merge, or original document when docs contains only one item.
*/
public static Document mergeDocuments(final DigestURL location, final String globalMime, final Document[] docs) {
if (docs == null || docs.length == 0) return null;

@ -20,6 +20,7 @@
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.File;
@ -31,12 +32,16 @@ import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@ -52,6 +57,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.SizeLimitedMap;
import net.yacy.cora.storage.SizeLimitedSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
@ -66,7 +72,7 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper {
private final static int MAX_TAGSIZE = 1024 * 1024;
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
public static final int MAX_DOCSIZE = 40 * 1024 * 1024;
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
@ -194,10 +200,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private int breadcrumbs;
/**
* {@link MultiProtocolURL} to the favicon that belongs to the document
*/
private MultiProtocolURL favicon;
/** links to icons that belongs to the document (mapped by absolute URL)*/
private final Map<DigestURL, IconEntry> icons;
/**
* The document root {@link MultiProtocolURL}
@ -230,6 +234,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.anchors = new ArrayList<AnchorURL>();
this.images = new ArrayList<ImageEntry>();
this.icons = new HashMap<>();
this.embeds = new SizeLimitedMap<AnchorURL, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<AnchorURL>(maxLinks);
this.iframes = new SizeLimitedSet<AnchorURL>(maxLinks);
@ -406,6 +411,69 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
/**
 * Parses sizes icon link attribute. (see
 * http://www.w3.org/TR/html5/links.html#attr-link-sizes) Eventual
 * duplicates are removed.
 * <p>
 * Tokens which do not match {@link IconEntry#SIZE_PATTERN} (such as the
 * "any" keyword) produce no dimension. The same applies to tokens whose
 * width or height does not fit in an int : they are ignored instead of
 * making the whole parsing fail.
 *
 * @param sizesAttr
 *            sizes attribute string, may be null
 * @return a set of sizes eventually empty.
 */
public static Set<Dimension> parseSizes(String sizesAttr) {
    Set<Dimension> sizes = new HashSet<Dimension>();
    for (String token : parseSpaceSeparatedTokens(sizesAttr)) {
        if (token == null) {
            continue;
        }
        Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token);
        if (!matcher.matches()) {
            /*
             * "any" keyword may be present, but doesn't have to produce a
             * dimension result
             */
            continue;
        }
        try {
            sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))));
        } catch (NumberFormatException ignored) {
            /*
             * The pattern accepts digit sequences of any length, so a value
             * may exceed the int range (e.g. "99999999999x16") : such a
             * token comes from untrusted content and is simply skipped
             */
        }
    }
    return sizes;
}
/**
 * Parses a space separated tokens attribute value (see
 * http://www.w3.org/TR/html5/infrastructure.html#space-separated-tokens).
 * Eventual duplicates are removed.
 * <p>
 * The attribute is trimmed, then split with the precompiled
 * {@link CommonPattern#SPACES} pattern.
 *
 * @param attr
 *            attribute string, may be null
 * @return a set of tokens eventually empty
 */
public static Set<String> parseSpaceSeparatedTokens(String attr) {
    Set<String> tokens = new HashSet<>();
    if (attr != null) {
        String trimmed = attr.trim();
        /* Check attr string is not empty to avoid adding a single empty string
         * in result */
        if (!trimmed.isEmpty()) {
            /* Split with the already compiled pattern rather than recompiling its regex on each call */
            Collections.addAll(tokens, CommonPattern.SPACES.split(trimmed));
        }
    }
    return tokens;
}
/**
 * Filters relationship tokens, keeping only the ones designating an icon
 * (standard or non standard). Retained tokens are converted to lower case.
 * @param relTokens relationship tokens (parsed from a rel attribute)
 * @return a Set of icon relations, eventually empty
 */
public Set<String> retainIconRelations(Collection<String> relTokens) {
    final Set<String> retained = new HashSet<>();
    for (final String relToken : relTokens) {
        if (!IconLinkRelations.isIconRel(relToken)) {
            continue;
        }
        retained.add(relToken.toLowerCase(Locale.ENGLISH));
    }
    return retained;
}
@Override
public void scrapeTag0(Tag tag) {
checkOpts(tag);
@ -473,14 +541,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (newLink != null) {
tag.opts.put("href", newLink.toNormalform(true));
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
/* Rel attribute is supposed to be a set of space-separated tokens */
Set<String> relTokens = parseSpaceSeparatedTokens(rel);
final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
final String type = tag.opts.getProperty("type", EMPTY_STRING);
final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon") || rel.equalsIgnoreCase("icon")) { // html5 -> rel="icon")
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.add(ie);
this.favicon = newLink;
Set<String> iconRels = retainIconRelations(relTokens);
/* Distinguish icons from images. It will enable for example to later search only images and no icons */
if (!iconRels.isEmpty()) {
String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING);
Set<Dimension> sizes = parseSizes(sizesAttr);
IconEntry icon = this.icons.get(newLink);
/* There is already an icon with same URL for this document :
* they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */
if(icon != null) {
icon.getRel().addAll(iconRels);
icon.getSizes().addAll(sizes);
} else {
icon = new IconEntry(newLink, iconRels, sizes);
this.icons.put(newLink, icon);
}
} else if (rel.equalsIgnoreCase("canonical")) {
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
@ -879,10 +961,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
/**
* @return the {@link MultiProtocolURL} to the favicon that belongs to the document
* @return all icons links
*/
public MultiProtocolURL getFavicon() {
return this.favicon;
public Map<DigestURL, IconEntry> getIcons() {
return this.icons;
}
/*
@ -1096,6 +1178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.iframes.clear();
this.embeds.clear();
this.images.clear();
this.icons.clear();
this.metas.clear();
this.hreflang.clear();
this.navigation.clear();

@ -0,0 +1,198 @@
/**
* IconEntry
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
/**
 * Represents an icon in a document : its URL, the link relations under which
 * the document references it, and its eventual sizes.
 *
 * @author luc
 *
 */
public class IconEntry {

    /** Pattern to parse a HTML link sizes token attribute (ie. "16x16") */
    public static final Pattern SIZE_PATTERN = Pattern.compile("([1-9][0-9]*)[xX]([1-9][0-9]*)");

    /** Icon URL */
    private final DigestURL url;

    /**
     * Icon links relations (one url may be used as multiple icon relations in
     * the same document)
     */
    private final Set<String> rel;

    /** Icon sizes */
    private final Set<Dimension> sizes;

    /**
     * Constructs instance from parameters. The given sets are stored as is
     * (no copy is made).
     *
     * @param url
     *            must not be null.
     * @param rel
     *            must not be null and contain at least one item.
     * @param sizes
     *            optional. When null, a new empty set is used.
     * @throws IllegalArgumentException
     *             when url is null, or when rel is null or empty
     */
    public IconEntry(final DigestURL url, Set<String> rel, Set<Dimension> sizes) {
        if (url == null) {
            throw new IllegalArgumentException("url must not be null.");
        }
        if (rel == null || rel.isEmpty()) {
            throw new IllegalArgumentException("rel must be specified");
        }
        this.url = url;
        this.rel = rel;
        this.sizes = (sizes == null) ? new HashSet<Dimension>() : sizes;
    }

    /**
     * @return true when rel property contains a standard IANA registered icon
     *         link relation
     */
    public boolean isStandardIcon() {
        for (String relation : this.rel) {
            if (IconLinkRelations.isStandardIconRel(relation)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Computes the mean of absolute width and height differences.
     *
     * @param size1
     * @param size2
     * @return distance between two sizes, or Double.MAX_VALUE when one size is null
     */
    public static double getDistance(Dimension size1, Dimension size2) {
        if (size1 == null || size2 == null) {
            return Double.MAX_VALUE;
        }
        final int widthGap = Math.abs(size1.width - size2.width);
        final int heightGap = Math.abs(size1.height - size2.height);
        return (widthGap + heightGap) / 2.0;
    }

    /**
     * @param preferredSize
     * @return the size among sizes property which is the closest to
     *         preferredSize, or null when sizes is empty or preferredSize is null.
     */
    public Dimension getClosestSize(Dimension preferredSize) {
        if (preferredSize == null) {
            return null;
        }
        Dimension best = null;
        double bestDistance = Double.MAX_VALUE;
        for (Dimension candidate : this.sizes) {
            final double candidateDistance = getDistance(candidate, preferredSize);
            /* The first candidate is always taken, then only strictly closer ones */
            if (best == null || candidateDistance < bestDistance) {
                best = candidate;
                bestDistance = candidateDistance;
            }
        }
        return best;
    }

    /**
     * @return a representation of this icon in the form of a HTML link tag.
     *         The sizes attribute is only written when at least one size is known.
     */
    @Override
    public String toString() {
        final StringBuilder tag = new StringBuilder("<link");
        tag.append(" href=\"").append(this.url.toNormalform(false)).append("\"");
        tag.append(" rel=\"").append(relToString()).append("\"");
        if (!this.sizes.isEmpty()) {
            tag.append(" sizes=\"").append(sizesToString()).append("\"");
        }
        return tag.append(">").toString();
    }

    /**
     * @return icon URL
     */
    public DigestURL getUrl() {
        return this.url;
    }

    /**
     * @return icons link relations (the internal set, not a copy)
     */
    public Set<String> getRel() {
        return this.rel;
    }

    /**
     * @return icon eventual sizes (the internal set, not a copy)
     */
    public Set<Dimension> getSizes() {
        return this.sizes;
    }

    /**
     * @return a string representation of sizes property, in the form of a valid
     *         HTML link tag sizes attribute (e.g. "16x16 64x64")
     */
    public String sizesToString() {
        final StringBuilder joined = new StringBuilder();
        for (Dimension size : this.sizes) {
            if (joined.length() != 0) {
                joined.append(' ');
            }
            joined.append(size.width).append('x').append(size.height);
        }
        return joined.toString();
    }

    /**
     * @return a string representation of rel property, in the form of a valid
     *         HTML link tag rel attribute (e.g. "icon apple-touch-icon")
     */
    public String relToString() {
        final StringBuilder joined = new StringBuilder();
        for (String relation : this.rel) {
            if (joined.length() != 0) {
                joined.append(' ');
            }
            joined.append(relation);
        }
        return joined.toString();
    }
}

@ -0,0 +1,92 @@
/**
* IconLinkRelations
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $
* $LastChangedRevision: 7567 $
* $LastChangedBy: low012 $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
/**
 * Enumeration of HTML link relationships (rel attribute) designating icon.
 * Each constant is immutable : it holds the rel attribute value and a human
 * readable description.
 * @author luc
 *
 */
public enum IconLinkRelations {

    /** Standard IANA registered icon link relation (see https://www.iana.org/assignments/link-relations/link-relations.xhtml) */
    ICON("icon", "Standard favicon"),

    /** Icon for IOS app shortcut */
    APPLE_TOUCH_ICON("apple-touch-icon", "IOS app shortcut icon"),

    /** Icon for IOS app shortcut (deprecated but still used by major websites in 2015) */
    APPLE_TOUCH_ICON_PRECOMPOSED("apple-touch-icon-precomposed", "Deprecated IOS app shortcut icon"),

    /** icon for Safari pinned tab */
    MASK_ICON("mask-icon", "Safari browser pinned tab icon"),

    /** Icon for Fluid web app */
    FLUID_ICON("fluid-icon", "Fluid app icon");

    /** HTML rel attribute value */
    private final String relValue;

    /** Human readable description */
    private final String description;

    private IconLinkRelations(String relValue, String description) {
        this.relValue = relValue;
        this.description = description;
    }

    /**
     * @return HTML rel attribute value
     */
    public String getRelValue() {
        return relValue;
    }

    /**
     * @return Human readable description of icon rel attribute
     */
    public String getDescription() {
        return description;
    }

    /**
     * Case insensitive check against every relation of this enumeration.
     *
     * @param relToken HTML rel attribute token, may be null
     * @return true when relToken is an icon relationship (standard or non-standard),
     *         false otherwise (including when relToken is null)
     */
    public static boolean isIconRel(String relToken) {
        for (IconLinkRelations iconRel : IconLinkRelations.values()) {
            if (iconRel.getRelValue().equalsIgnoreCase(relToken)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Case insensitive check against the {@link #ICON} relation only.
     *
     * @param relToken HTML rel attribute token, may be null
     * @return true when relToken is Standard IANA registered icon link relation,
     *         false otherwise (including when relToken is null)
     */
    public static boolean isStandardIconRel(String relToken) {
        return ICON.getRelValue().equalsIgnoreCase(relToken);
    }
}

@ -166,7 +166,7 @@ public class htmlParser extends AbstractParser implements Parser {
noDoubleImages,
scraper.indexingDenied(),
scraper.getDate());
ppd.setFavicon(scraper.getFavicon());
ppd.setIcons(scraper.getIcons());
return ppd;
}

@ -22,18 +22,24 @@
package net.yacy.kelondro.data.meta;
import java.awt.Dimension;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.analysis.Classification;
@ -49,6 +55,8 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
@ -64,8 +72,6 @@ import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.utils.crypt;
import org.apache.solr.common.SolrDocument;
/**
* This is the URIMetadata object implementation for Solr documents.
@ -507,6 +513,78 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
return list.iterator();
}
/**
 * Extracts icon entries from this solr document.
 * <p>
 * Icon URLs are rebuilt from the icons_urlstub_sxt and icons_protocol_sxt
 * fields ("http" is used when no protocol is available for an entry).
 * Relations and sizes are read at the same index from icons_rel_sxt and
 * icons_sizes_sxt. Entries whose URL stub is not a String or whose URL is
 * malformed are skipped.
 *
 * @return icon entries collection eventually empty
 */
public final Collection<IconEntry> getIcons() {
    Collection<IconEntry> icons = new ArrayList<>();
    List<?> iconsUrlStubsList = getFieldValuesAsList(CollectionSchema.icons_urlstub_sxt.getSolrFieldName());
    if (iconsUrlStubsList != null) {
        List<String> protocols = CollectionConfiguration.indexedList2protocolList(
                getFieldValues(CollectionSchema.icons_protocol_sxt.getSolrFieldName()), iconsUrlStubsList.size());
        List<?> allSizes = getFieldValuesAsList(CollectionSchema.icons_sizes_sxt.getSolrFieldName());
        List<?> allRels = getFieldValuesAsList(CollectionSchema.icons_rel_sxt.getSolrFieldName());

        for (int index = 0; index < iconsUrlStubsList.size(); index++) {
            Object item = iconsUrlStubsList.get(index);
            if (!(item instanceof String)) {
                continue;
            }
            String urlStub = (String) item;
            String protocol = (protocols != null && protocols.size() > index) ? protocols.get(index) : "http";
            DigestURL iconURL;
            try {
                iconURL = new DigestURL(protocol + "://" + urlStub);
            } catch (MalformedURLException ignored) {
                /* Best effort : an icon with an unusable URL is left out of the result */
                continue;
            }

            Set<String> rels = null;
            if (allRels.size() > index) {
                item = allRels.get(index);
                if (item instanceof String) {
                    rels = ContentScraper.parseSpaceSeparatedTokens((String) item);
                }
            }
            /*
             * A missing value may happen when icons_rel_sxt field has been disabled in solr schema.
             * A blank stored value produces an empty set : it is handled the same way, because
             * IconEntry constructor rejects an empty rel set with an IllegalArgumentException.
             */
            if (rels == null || rels.isEmpty()) {
                rels = new HashSet<>();
                rels.add("unknown");
            }

            Set<Dimension> sizes = null;
            if (allSizes.size() > index) {
                item = allSizes.get(index);
                if (item instanceof String) {
                    sizes = ContentScraper.parseSizes((String) item);
                }
            }

            icons.add(new IconEntry(iconURL, rels, sizes));
        }
    }
    return icons;
}
/**
 * Gives access to the values of a field as a List.
 *
 * @param name field name
 * @return field values from field name, or an immutable empty list when
 *         field has no values or its values are not held in a List
 */
public List<?> getFieldValuesAsList(String name) {
    final Collection<Object> values = getFieldValues(name);
    if (values instanceof List<?>) {
        return (List<?>) values;
    }
    return Collections.emptyList();
}
public static Date getDate(SolrDocument doc, final CollectionSchema key) {
Date x = doc == null ? null : (Date) doc.getFieldValue(key.getSolrFieldName());
Date now = new Date();

@ -48,6 +48,13 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
@ -88,6 +95,7 @@ import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -100,13 +108,6 @@ import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@ -543,6 +544,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final Object parser = document.getParserObject();
boolean containsCanonical = false;
DigestURL canonical = null;
processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
List<ImageEntry> images = html.getImages();
@ -666,45 +670,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
// images
final ArrayList<String> imgprots = new ArrayList<String>(images.size());
final Integer[] imgheights = new Integer[images.size()];
final Integer[] imgwidths = new Integer[images.size()];
final Integer[] imgpixels = new Integer[images.size()];
final String[] imgstubs = new String[images.size()];
final String[] imgalts = new String[images.size()];
int withalt = 0;
int i = 0;
LinkedHashSet<String> images_text_map = new LinkedHashSet<String>();
for (final ImageEntry ie: images) {
final MultiProtocolURL uri = ie.url();
inboundLinks.remove(uri);
outboundLinks.remove(uri);
imgheights[i] = ie.height();
imgwidths[i] = ie.width();
imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width();
String protocol = uri.getProtocol();
imgprots.add(protocol);
imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt();
for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it);
if (ie.alt() != null && ie.alt().length() > 0) {
SentenceReader sr = new SentenceReader(ie.alt());
while (sr.hasNext()) images_text_map.add(sr.next().toString());
withalt++;
}
i++;
}
StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
for (String s: images_text_map) images_text.append(s.trim()).append(' ');
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts);
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights);
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());
processImages(doc, allAttr, inboundLinks, outboundLinks, images);
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_sxt)) {
@ -1031,6 +997,116 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return doc;
}
/**
 * Add icons metadata to the Solr document when the corresponding schema
 * attributes are enabled. Every icon URL is also removed from inboundLinks
 * and outboundLinks.
 *
 * @param doc
 *            solr document to fill
 * @param allAttr
 *            when true, all attributes are written regardless of the schema
 *            configuration
 * @param inboundLinks
 *            all document inbound links
 * @param outboundLinks
 *            all document outbound links
 * @param icons
 *            document icon entries
 */
private void processIcons(SolrVector doc, boolean allAttr, LinkedHashMap<DigestURL, String> inboundLinks,
        LinkedHashMap<DigestURL, String> outboundLinks, Collection<IconEntry> icons) {
    final int iconCount = icons.size();
    final List<String> iconProtocols = new ArrayList<String>(iconCount);
    final String[] iconSizes = new String[iconCount];
    final String[] iconStubs = new String[iconCount];
    final String[] iconRels = new String[iconCount];

    /* First pass : collect the flat values, one slot per icon */
    int index = 0;
    for (final IconEntry entry : icons) {
        final DigestURL iconUrl = entry.getUrl();

        /* an icon must not be reported as a regular document link */
        inboundLinks.remove(iconUrl);
        outboundLinks.remove(iconUrl);

        final String scheme = iconUrl.getProtocol();
        iconProtocols.add(scheme);

        /*
         * One icon may declare several sizes and several rels : each list
         * is flattened to a single string, because a solr field can not
         * hold a multidimensional array
         */
        iconSizes[index] = entry.sizesToString();
        /* the stub is the url without its protocol and the following "://" */
        iconStubs[index] = iconUrl.toString().substring(scheme.length() + 3);
        iconRels[index] = entry.relToString();
        index++;
    }

    /* Second pass : write only the fields that are enabled */
    if (allAttr || contains(CollectionSchema.icons_protocol_sxt)) {
        add(doc, CollectionSchema.icons_protocol_sxt, protocolList2indexedList(iconProtocols));
    }
    if (allAttr || contains(CollectionSchema.icons_urlstub_sxt)) {
        add(doc, CollectionSchema.icons_urlstub_sxt, iconStubs);
    }
    if (allAttr || contains(CollectionSchema.icons_rel_sxt)) {
        add(doc, CollectionSchema.icons_rel_sxt, iconRels);
    }
    if (allAttr || contains(CollectionSchema.icons_sizes_sxt)) {
        add(doc, CollectionSchema.icons_sizes_sxt, iconSizes);
    }
}
/**
 * Add images metadata to the Solr document when the corresponding schema
 * attributes are enabled. Every image URL is also removed from inboundLinks
 * and outboundLinks.
 *
 * @param doc solr document to fill
 * @param allAttr when true, all attributes are written regardless of the schema configuration
 * @param inboundLinks all document inbound links
 * @param outboundLinks all document outbound links
 * @param images document images
 */
private void processImages(SolrVector doc, boolean allAttr, LinkedHashMap<DigestURL, String> inboundLinks,
        LinkedHashMap<DigestURL, String> outboundLinks, List<ImageEntry> images) {
    final int imageCount = images.size();
    final ArrayList<String> protocols = new ArrayList<String>(imageCount);
    final Integer[] heights = new Integer[imageCount];
    final Integer[] widths = new Integer[imageCount];
    final Integer[] pixels = new Integer[imageCount];
    final String[] stubs = new String[imageCount];
    final String[] alts = new String[imageCount];
    /* insertion ordered and duplicate free collection of image related words */
    final LinkedHashSet<String> words = new LinkedHashSet<String>();
    int altCount = 0;
    int index = 0;

    /* First pass : collect the flat values, one slot per image */
    for (final ImageEntry image : images) {
        final MultiProtocolURL imageUrl = image.url();
        inboundLinks.remove(imageUrl);
        outboundLinks.remove(imageUrl);

        final int height = image.height();
        final int width = image.width();
        heights[index] = height;
        widths[index] = width;
        /* a negative dimension makes the pixel count meaningless : it is stored as -1 */
        if (height < 0 || width < 0) {
            pixels[index] = -1;
        } else {
            pixels[index] = height * width;
        }

        final String scheme = imageUrl.getProtocol();
        protocols.add(scheme);
        /* the stub is the url without its protocol and the following "://" */
        stubs[index] = imageUrl.toString().substring(scheme.length() + 3);

        final String alt = image.alt();
        alts[index] = alt;

        /* words come from the tokenized url first, then from the alt text sentences */
        for (final String token : CommonPattern.SPACE.split(imageUrl.toTokens())) {
            words.add(token);
        }
        if (alt != null && alt.length() > 0) {
            final SentenceReader sentences = new SentenceReader(alt);
            while (sentences.hasNext()) {
                words.add(sentences.next().toString());
            }
            altCount++;
        }
        index++;
    }

    final StringBuilder text = new StringBuilder(words.size() * 6 + 1);
    for (final String word : words) {
        text.append(word.trim()).append(' ');
    }

    /* Second pass : write only the fields that are enabled */
    if (allAttr || contains(CollectionSchema.imagescount_i)) {
        add(doc, CollectionSchema.imagescount_i, images.size());
    }
    if (allAttr || contains(CollectionSchema.images_protocol_sxt)) {
        add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(protocols));
    }
    if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) {
        add(doc, CollectionSchema.images_urlstub_sxt, stubs);
    }
    if (allAttr || contains(CollectionSchema.images_alt_sxt)) {
        add(doc, CollectionSchema.images_alt_sxt, alts);
    }
    if (allAttr || contains(CollectionSchema.images_height_val)) {
        add(doc, CollectionSchema.images_height_val, heights);
    }
    if (allAttr || contains(CollectionSchema.images_width_val)) {
        add(doc, CollectionSchema.images_width_val, widths);
    }
    if (allAttr || contains(CollectionSchema.images_pixel_val)) {
        add(doc, CollectionSchema.images_pixel_val, pixels);
    }
    if (allAttr || contains(CollectionSchema.images_withalt_i)) {
        add(doc, CollectionSchema.images_withalt_i, altCount);
    }
    if (allAttr || contains(CollectionSchema.images_text_t)) {
        add(doc, CollectionSchema.images_text_t, text.toString().trim());
    }
}
/**
* attach additional information to the document to enable navigation features
* @param doc the document to be enriched
@ -1937,14 +2013,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return a;
}
/**
 * Uncompress an indexed list of protocol names to a list of the specified dimension.
 * Every position of the result defaults to "http"; each well formed entry of
 * iplist ("&lt;index&gt;-&lt;protocol&gt;", for example "001-https") overrides the
 * protocol at its index. Badly formatted entries (null or non String entry,
 * missing or leading '-', empty protocol, non numeric index, index outside of
 * [0, dimension[) are silently ignored.
 * @param iplist indexed list typically produced by protocolList2indexedList, may be null
 * @param dimension size of target list
 * @return a list of protocol names holding exactly dimension items
 */
public static List<String> indexedList2protocolList(Collection<Object> iplist, int dimension) {
    List<String> a = new ArrayList<String>(dimension);
    for (int i = 0; i < dimension; i++) a.add("http");
    if (iplist == null) return a;
    for (Object ip : iplist) {
        /* Silently ignore entries which are not strings */
        if (!(ip instanceof String)) {
            continue;
        }
        // ip format is 001-https but can be 4 digits 1011-https
        String indexedProtocol = (String) ip;
        int separator = indexedProtocol.indexOf('-');
        /* Silently ignore badly formatted entry : both index and protocol parts must be non empty */
        if (separator <= 0 || indexedProtocol.length() <= (separator + 1)) {
            continue;
        }
        int index;
        try {
            index = Integer.parseInt(indexedProtocol.substring(0, separator));
        } catch (NumberFormatException e) {
            /* the index part is not a number : same policy as any other badly formatted entry */
            continue;
        }
        /* an index outside of the target list can not be applied */
        if (index >= 0 && index < dimension) {
            a.set(index, indexedProtocol.substring(separator + 1));
        }
    }
    return a;
}

@ -139,6 +139,12 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol"),
outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"),
icons_urlstub_sxt(SolrType.string, true, true, true, false, true, "all icon links without the protocol and '://'"),
/** All icon links protocols : split from icons_urlstub to provide some compression, as http protocol is implied as default and not stored */
icons_protocol_sxt(SolrType.string, true, true, true, false, false, "all icon links protocols"),
icons_rel_sxt(SolrType.string, true, true, true, false, false, "all icon links relationships space separated (e.g.. 'icon apple-touch-icon')"),
/** All icon sizes : one flat string per icon (e.g. '16x16 32x32'), so the field holds strings and not integers */
icons_sizes_sxt(SolrType.string, true, true, true, false, false, "all icon sizes space separated (e.g. '16x16 32x32')"),
images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"),
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"),

@ -0,0 +1,125 @@
/**
* ContentScraperTest
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.util.Set;
import org.junit.Assert;
import org.junit.Test;
/**
 * Unit tests for ContentScraper class.
 * @author luc
 *
 */
public class ContentScraperTest {

    /**
     * Parse a raw sizes value, then check that the result holds exactly as
     * many entries as expected dimensions and contains each one of them.
     * @param rawSizes the value passed to ContentScraper.parseSizes
     * @param expected the dimensions the parsed set must contain
     */
    private static void assertParsedSizes(final String rawSizes, final Dimension... expected) {
        final Set<Dimension> parsed = ContentScraper.parseSizes(rawSizes);
        Assert.assertEquals(expected.length, parsed.size());
        for (final Dimension dimension : expected) {
            Assert.assertTrue(parsed.contains(dimension));
        }
    }

    /**
     * Parse a raw tokens value, then check that the result holds exactly as
     * many entries as expected tokens and contains each one of them.
     * @param rawTokens the value passed to ContentScraper.parseSpaceSeparatedTokens
     * @param expected the tokens the parsed set must contain
     */
    private static void assertParsedTokens(final String rawTokens, final String... expected) {
        final Set<String> parsed = ContentScraper.parseSpaceSeparatedTokens(rawTokens);
        Assert.assertEquals(expected.length, parsed.size());
        for (final String token : expected) {
            Assert.assertTrue(parsed.contains(token));
        }
    }

    @Test
    public final void testParseSizes() {
        /* Normal case */
        assertParsedSizes("96x128", new Dimension(96, 128));

        /* "any" keyword */
        assertParsedSizes("any");

        /* Multiple valid sizes, lower and upper case separator */
        assertParsedSizes("96x128 16X16 1X2 1024x768", new Dimension(96, 128), new Dimension(16, 16),
                new Dimension(1, 2), new Dimension(1024, 768));

        /* Duplicate entries */
        assertParsedSizes("96x128 96X128 1X2 96x128", new Dimension(96, 128), new Dimension(1, 2));

        /* Multiple inner and trailing spaces */
        assertParsedSizes(" 96x128 16X16 ", new Dimension(96, 128), new Dimension(16, 16));

        /* Empty string */
        assertParsedSizes("");

        /* null string */
        assertParsedSizes((String) null);

        /* Invalid sizes */
        assertParsedSizes("096x0128 -16x-16 0x0 x768 78x axb 1242");

        /* Mix of valid and invalid sizes */
        assertParsedSizes("96x128 16X16 axb 123 78x32", new Dimension(96, 128), new Dimension(16, 16),
                new Dimension(78, 32));
    }

    @Test
    public final void testParseSpaceSeparatedTokens() {
        /* Normal case */
        assertParsedTokens("abc de", "abc", "de");

        /* One item only */
        assertParsedTokens("abc", "abc");

        /* Multiple inner and trailing spaces */
        assertParsedTokens(" abc d efff fgj ", "abc", "d", "efff", "fgj");

        /* Duplicate entries : ignoring case is not the purpose of this function, so "ABC" is kept */
        assertParsedTokens("abc bb abc abc ABC", "abc", "ABC", "bb");

        /* Empty string */
        assertParsedTokens("");

        /* Null string */
        assertParsedTokens((String) null);
    }
}

@ -0,0 +1,192 @@
/**
* IconEntryTest
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.DigestURL;
/**
 * Unit tests for IconEntry class.
 * @author luc
 *
 */
public class IconEntryTest {

    /**
     * @param dimensions the sizes to gather
     * @return a new mutable set filled with the given sizes
     */
    private static Set<Dimension> sizeSet(final Dimension... dimensions) {
        final Set<Dimension> result = new HashSet<>();
        for (final Dimension dimension : dimensions) {
            result.add(dimension);
        }
        return result;
    }

    /**
     * @param relValues the rel attribute values to gather
     * @return a new mutable set filled with the given rel values
     */
    private static Set<String> relSet(final String... relValues) {
        final Set<String> result = new HashSet<>();
        for (final String relValue : relValues) {
            result.add(relValue);
        }
        return result;
    }

    /**
     * @param rels icon rel values
     * @param sizes icon sizes
     * @return a new icon entry on the test URL
     * @throws MalformedURLException when the test URL is rejected
     */
    private static IconEntry iconWith(final Set<String> rels, final Set<Dimension> sizes)
            throws MalformedURLException {
        return new IconEntry(new DigestURL("http://yacy.net"), rels, sizes);
    }

    @Test
    public final void testGetDistance() {
        final Dimension reference = new Dimension(5, 8);
        final Dimension larger = new Dimension(7, 12);

        /* Normal case : one size has both width and height greater */
        Assert.assertEquals(3.0, IconEntry.getDistance(reference, larger), 0.0);

        /* Inverted parameters should produce the same result */
        Assert.assertEquals(3.0, IconEntry.getDistance(larger, reference), 0.0);

        /* Equal sizes */
        Assert.assertEquals(0.0, IconEntry.getDistance(reference, new Dimension(5, 8)), 0.0);

        /* Equal sizes, checked once more against another equal instance */
        Assert.assertEquals(0.0, IconEntry.getDistance(reference, new Dimension(5, 8)), 0.0);

        /* Only one dimension differs */
        Assert.assertEquals(2.0, IconEntry.getDistance(reference, new Dimension(5, 12)), 0.0);
        Assert.assertEquals(2.5, IconEntry.getDistance(reference, new Dimension(10, 8)), 0.0);

        /* width lower, height upper */
        Assert.assertEquals(3.0, IconEntry.getDistance(reference, new Dimension(3, 12)), 0.0);

        /* negative values */
        Assert.assertEquals(3.0, IconEntry.getDistance(new Dimension(-5, -8), new Dimension(-7, -12)), 0.0);

        /* one null */
        Assert.assertEquals(Double.MAX_VALUE, IconEntry.getDistance((Dimension) null, new Dimension(-7, -12)),
                0.0);
    }

    @Test
    public final void testGetClosestSize() throws MalformedURLException {
        final Set<String> rels = relSet(IconLinkRelations.ICON.getRelValue());
        final IconEntry threeSizes = iconWith(rels,
                sizeSet(new Dimension(128, 128), new Dimension(256, 512), new Dimension(16, 16)));

        /* Preferred size in sizes set */
        final Dimension exactMatch = new Dimension(16, 16);
        Assert.assertEquals(exactMatch, threeSizes.getClosestSize(exactMatch));

        /* Preferred size lower than all sizes in set */
        Assert.assertEquals(new Dimension(16, 16), threeSizes.getClosestSize(new Dimension(12, 12)));

        /* Preferred size greater than all sizes in set */
        Assert.assertEquals(new Dimension(256, 512), threeSizes.getClosestSize(new Dimension(1992, 1024)));

        /* Preferred size between sizes in set */
        Assert.assertEquals(new Dimension(16, 16), threeSizes.getClosestSize(new Dimension(17, 18)));

        /* Sizes set contains only one item */
        final IconEntry oneSize = iconWith(rels, sizeSet(new Dimension(128, 128)));
        Assert.assertEquals(new Dimension(128, 128), oneSize.getClosestSize(new Dimension(1992, 1024)));

        /* Empty sizes set */
        final IconEntry noSize = iconWith(rels, sizeSet());
        Assert.assertNull(noSize.getClosestSize(new Dimension(16, 16)));

        /* Null preferred size */
        final IconEntry threeSizesAgain = iconWith(rels,
                sizeSet(new Dimension(128, 128), new Dimension(256, 512), new Dimension(16, 16)));
        Assert.assertNull(threeSizesAgain.getClosestSize((Dimension) null));
    }

    @Test
    public final void testSizesToString() throws MalformedURLException {
        final Set<String> rels = relSet(IconLinkRelations.ICON.getRelValue());

        /* Multiple values in sizes set */
        final String threeSizes = iconWith(rels,
                sizeSet(new Dimension(128, 128), new Dimension(256, 512), new Dimension(16, 16))).sizesToString();
        /* The set is not ordered, only check result contains what we expect */
        Assert.assertTrue(threeSizes.contains("128x128"));
        Assert.assertTrue(threeSizes.contains("256x512"));
        Assert.assertTrue(threeSizes.contains("16x16"));
        Assert.assertTrue(threeSizes.contains(" "));

        /* One value in sizes set */
        final String oneSize = iconWith(rels, sizeSet(new Dimension(128, 128))).sizesToString();
        Assert.assertEquals("128x128", oneSize);

        /* Empty sizes set */
        final String noSize = iconWith(rels, sizeSet()).sizesToString();
        Assert.assertTrue(noSize.isEmpty());
    }

    @Test
    public final void testRelToString() throws MalformedURLException {
        final Set<Dimension> sizes = sizeSet(new Dimension(128, 128));

        /* Multiple values in rel set */
        final String threeRels = iconWith(relSet(IconLinkRelations.ICON.getRelValue(),
                IconLinkRelations.APPLE_TOUCH_ICON.getRelValue(), IconLinkRelations.MASK_ICON.getRelValue()),
                sizes).relToString();
        /* The set is not ordered, only check result contains what we expect */
        Assert.assertTrue(threeRels.contains(IconLinkRelations.ICON.getRelValue()));
        Assert.assertTrue(threeRels.contains(IconLinkRelations.APPLE_TOUCH_ICON.getRelValue()));
        Assert.assertTrue(threeRels.contains(IconLinkRelations.MASK_ICON.getRelValue()));
        Assert.assertTrue(threeRels.contains(" "));

        /* One value in rel set */
        final String oneRel = iconWith(relSet(IconLinkRelations.ICON.getRelValue()), sizes).relToString();
        Assert.assertEquals(IconLinkRelations.ICON.getRelValue(), oneRel);
    }
}

@ -0,0 +1,157 @@
/**
* URIMetadataNodeTest
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.awt.Dimension;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
/**
 * Unit tests for URIMetadataNode class.
 *
 * @author luc
 *
 */
public class URIMetadataNodeTest {

    /**
     * Check that an icon declares exactly one size and one rel, with the
     * expected values.
     *
     * @param icon the icon entry to check
     * @param expectedWidth expected width of the single size
     * @param expectedHeight expected height of the single size
     * @param expectedRel expected single rel value
     */
    private static void assertSingleSizeAndRel(final IconEntry icon, final int expectedWidth,
            final int expectedHeight, final String expectedRel) {
        Assert.assertEquals(1, icon.getSizes().size());
        final Dimension size = icon.getSizes().iterator().next();
        Assert.assertEquals(expectedWidth, size.width);
        Assert.assertEquals(expectedHeight, size.height);
        Assert.assertEquals(1, icon.getRel().size());
        Assert.assertEquals(expectedRel, icon.getRel().iterator().next());
    }

    /**
     * Three standard icons with different sizes, one non-standard with a larger
     * size
     */
    @Test
    public final void testGetIcons4Items() throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://somehost.org"));
        final String[] stubs = { "somehost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png",
                "somehost.org/static/images/icon64.png", "somehost.org/static/images/iconApple128.png" };
        node.setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), stubs);

        final List<String> indexedProtocols = CollectionConfiguration
                .protocolList2indexedList(Arrays.asList(new String[] { "http", "https", "https", "http" }));
        node.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), indexedProtocols);

        final String[] rels = { "icon", "icon", "icon", "apple-touch-icon" };
        node.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), rels);

        final String[] sizes = { "16x24", "32x32", "58x64", "128x128" };
        node.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), sizes);

        final Collection<IconEntry> icons = node.getIcons();

        /* Check results consistency : every expected url must be met, with its own size and rel */
        int recognized = 0;
        for (final IconEntry icon : icons) {
            final String url = icon.getUrl().toNormalform(false);
            if ("http://somehost.org/static/images/icon16.png".equals(url)) {
                assertSingleSizeAndRel(icon, 16, 24, "icon");
                recognized++;
            } else if ("https://somehost.org/static/images/icon32.png".equals(url)) {
                assertSingleSizeAndRel(icon, 32, 32, "icon");
                recognized++;
            } else if ("https://somehost.org/static/images/icon64.png".equals(url)) {
                assertSingleSizeAndRel(icon, 58, 64, "icon");
                recognized++;
            } else if ("http://somehost.org/static/images/iconApple128.png".equals(url)) {
                assertSingleSizeAndRel(icon, 128, 128, "apple-touch-icon");
                recognized++;
            }
        }
        Assert.assertEquals(4, recognized);
    }

    /**
     * Only icons_urlstub_sxt field valued
     */
    @Test
    public final void testGetIconsOnlyIconsUrlstubSxt() throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://somehost.org"));
        final String[] stubs = { "somehost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png",
                "somehost.org/static/images/icon64.png", "somehost.org/static/images/iconApple124.png" };
        node.setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), stubs);

        final Collection<IconEntry> icons = node.getIcons();
        Assert.assertEquals(4, icons.size());
    }

    /**
     * Only one standard icon
     */
    @Test
    public final void testGetIcons1Item() throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://somehost.org"));
        final String[] stubs = { "somehost.org/static/images/icon16.png" };
        node.setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), stubs);

        final List<String> indexedProtocols = CollectionConfiguration
                .protocolList2indexedList(Arrays.asList(new String[] { "http" }));
        node.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), indexedProtocols);

        final String[] rels = { "icon" };
        node.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), rels);

        final String[] sizes = { "16x16" };
        node.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), sizes);

        final Collection<IconEntry> icons = node.getIcons();
        Assert.assertEquals(1, icons.size());

        final IconEntry onlyIcon = icons.iterator().next();
        Assert.assertEquals(1, onlyIcon.getSizes().size());
        final Dimension onlySize = onlyIcon.getSizes().iterator().next();
        Assert.assertEquals(16.0, onlySize.getWidth(), 0.0);
        Assert.assertEquals(16.0, onlySize.getHeight(), 0.0);
    }

    /**
     * No Icon
     */
    @Test
    public final void testGetIconsNoIcon() throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://somehost.org"));
        Assert.assertEquals(0, node.getIcons().size());
    }
}

@ -0,0 +1,187 @@
/**
* yacysearchitemTest
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.awt.Dimension;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
/**
 * Unit tests for yacysearchitem class.
 *
 * @author luc
 *
 */
public class yacysearchitemTest {

    /**
     * Build a metadata node for http://somehost.org filled with icons fields.
     *
     * @param urlStubs icons urls without protocol
     * @param protocols icons protocols, one per url stub
     * @param rels icons rel values, one per url stub
     * @param sizes icons sizes, one per url stub; when null the sizes field is left unset
     * @return the filled metadata node
     * @throws MalformedURLException when the node URL is rejected
     */
    private static URIMetadataNode nodeWithIcons(final String[] urlStubs, final String[] protocols,
            final String[] rels, final String[] sizes) throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://somehost.org"));
        node.setField(CollectionSchema.icons_urlstub_sxt.getSolrFieldName(), urlStubs);
        final List<String> indexedProtocols = CollectionConfiguration
                .protocolList2indexedList(Arrays.asList(protocols));
        node.setField(CollectionSchema.icons_protocol_sxt.getSolrFieldName(), indexedProtocols);
        node.setField(CollectionSchema.icons_rel_sxt.getSolrFieldName(), rels);
        if (sizes != null) {
            node.setField(CollectionSchema.icons_sizes_sxt.getSolrFieldName(), sizes);
        }
        return node;
    }

    /**
     * Ask yacysearchitem for the favicon URL of a node, for a square preferred size.
     *
     * @param node the metadata node to get the favicon from
     * @param pixels width and height of the preferred size
     * @return the URL returned by yacysearchitem.getFaviconURL
     */
    private static DigestURL faviconFor(final URIMetadataNode node, final int pixels) {
        return yacysearchitem.getFaviconURL(false, RequestHeader.FileType.HTML, node,
                new Dimension(pixels, pixels));
    }

    /**
     * Check the favicon URL of a node is not null and has the expected normal form.
     *
     * @param expectedUrl the expected favicon URL normal form
     * @param node the metadata node to get the favicon from
     * @param pixels width and height of the preferred size
     */
    private static void assertFavicon(final String expectedUrl, final URIMetadataNode node, final int pixels) {
        final DigestURL faviconURL = faviconFor(node, pixels);
        Assert.assertNotNull(faviconURL);
        Assert.assertEquals(expectedUrl, faviconURL.toNormalform(false));
    }

    /**
     * Three standard icons with different sizes, one non-standard with a larger
     * size
     *
     * @throws MalformedURLException
     */
    @Test
    public final void testGetFaviconURL() throws MalformedURLException {
        final URIMetadataNode node = nodeWithIcons(
                new String[] { "someHost.org/static/images/icon16.png", "somehost.org/static/images/icon32.png",
                        "somehost.org/static/images/icon64.png", "somehost.org/static/images/iconApple124.png" },
                new String[] { "http", "http", "http", "http" },
                new String[] { "icon", "icon", "icon", "apple-touch-icon" },
                new String[] { "16x16", "32x32", "64x64", "128x128" });

        /* Search for a size present in icons collection */
        assertFavicon("http://somehost.org/static/images/icon32.png", node, 32);

        /* Search for a size not in icons collection */
        assertFavicon("http://somehost.org/static/images/icon32.png", node, 40);

        /*
         * Search for a size equal to the non-standard one : a standard icon is
         * still preferred
         */
        assertFavicon("http://somehost.org/static/images/icon64.png", node, 128);
    }

    /**
     * Only non-standard icons
     *
     * @throws MalformedURLException
     */
    @Test
    public final void testGetFaviconURLNonStandard() throws MalformedURLException {
        final URIMetadataNode node = nodeWithIcons(
                new String[] { "somehost.org/static/images/mask32.png", "somehost.org/static/images/fluid.64.png",
                        "somehost.org/static/images/iconApple124.png" },
                new String[] { "http", "http", "http" },
                new String[] { "mask-icon", "fluid-icon", "apple-touch-icon" },
                new String[] { "32x32", "64x64", "128x128" });

        /* Non standard icon is returned as fallback */
        assertFavicon("http://somehost.org/static/images/mask32.png", node, 32);
    }

    /**
     * One standard icon with multiple sizes
     *
     * @throws MalformedURLException
     */
    @Test
    public final void testGetFaviconURLMultiSizes() throws MalformedURLException {
        final URIMetadataNode node = nodeWithIcons(new String[] { "somehost.org/static/images/favicon.ico" },
                new String[] { "http" }, new String[] { "icon" }, new String[] { "16x16 32x32 64x64" });

        /* Search for a size in sizes set */
        assertFavicon("http://somehost.org/static/images/favicon.ico", node, 32);

        /* Search for a size not in sizes set */
        assertFavicon("http://somehost.org/static/images/favicon.ico", node, 40);
    }

    /**
     * One standard icon with no size
     *
     * @throws MalformedURLException
     */
    @Test
    public final void testGetFaviconURLNoSize() throws MalformedURLException {
        final URIMetadataNode node = nodeWithIcons(new String[] { "somehost.org/static/images/favicon.ico" },
                new String[] { "http" }, new String[] { "icon" }, null);

        assertFavicon("http://somehost.org/static/images/favicon.ico", node, 32);
    }

    /**
     * No icon in document
     *
     * @throws MalformedURLException
     */
    @Test
    public final void testGetFaviconURLNoIcon() throws MalformedURLException {
        final URIMetadataNode node = new URIMetadataNode(new DigestURL("http://someHost.org"));

        /* Default fallback favicon URL should be generated */
        final DigestURL faviconURL = faviconFor(node, 32);
        Assert.assertEquals("http://somehost.org/favicon.ico", faviconURL.toNormalform(false));
    }
}
Loading…
Cancel
Save