From eac9650b31955f85adbdc16f987966a3e568809d Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 18 Dec 2012 17:20:42 +0100 Subject: [PATCH] added another solr field clickdepth_i which reflects the number of clicks necessary to get from the portal of a host to a specific document. At this time, only the start document is flagged with clickdepth '0'; all others are flagged with '-1'. To get the actual clickdepth, a process must use crawled information to collect the actual number of clicks. This will be added in a later step. --- defaults/solr.keys.list | 3 + .../yacy/cora/federate/solr/YaCySchema.java | 1 + .../yacy/search/index/SolrConfiguration.java | 70 +++++++++++-------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 78c51beb3..921e7c7b5 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -68,6 +68,9 @@ httpstatus_i ## number of unique http references; used for ranking references_i +## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url +clickdepth_i + ### optional but highly recommended values, part of the index distribution process ## time when resource was loaded diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 2436f5352..8216cfd88 100644 --- a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -48,6 +48,7 @@ public enum YaCySchema implements Schema { httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. 
\"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, "number of unique http references; used for ranking"), + clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"), // optional but recommended, part of index distribution load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"), diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index a769cb4ae..516b590f3 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -38,6 +38,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -306,6 +307,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } + private final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php"); + protected SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); @@ -313,8 +316,15 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable boolean allAttr = this.isEmpty(); add(doc, YaCySchema.id, id); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) - String us = digestURI.toNormalform(true); - add(doc, YaCySchema.sku, 
us); + String docurl = digestURI.toNormalform(true); + add(doc, YaCySchema.sku, docurl); + + if (allAttr || contains(YaCySchema.clickdepth_i)) { + String path = digestURI.getPath(); + boolean fronturl = path.length() == 0 || rootPattern.matcher(path).matches(); + add(doc, YaCySchema.clickdepth_i, fronturl ? 0 : -1); + } + if (allAttr || contains(YaCySchema.ip_s)) { final InetAddress address = digestURI.getInetAddress(); if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress()); @@ -329,7 +339,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()])); if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()])); } - if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length()); + if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, docurl.length()); String host = null; if ((host = digestURI.getHost()) != null) { String dnc = Domains.getDNC(host); @@ -543,13 +553,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final String[] css_url = new String[csss.size()]; c = 0; for (final Map.Entry entry: csss.entrySet()) { - final String url = entry.getKey().toNormalform(false); - inboundLinks.remove(url); - outboundLinks.remove(url); + final String cssurl = entry.getKey().toNormalform(false); + inboundLinks.remove(cssurl); + outboundLinks.remove(cssurl); css_tag[c] = ""; - css_url[c] = url; + " href=\""+ cssurl + "\" />"; + css_url[c] = cssurl; c++; } add(doc, YaCySchema.csscount_i, css_tag.length); @@ -562,10 +572,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final Set scriptss = html.getScript(); final String[] scripts = new 
String[scriptss.size()]; c = 0; - for (final MultiProtocolURI url: scriptss) { - inboundLinks.remove(url); - outboundLinks.remove(url); - scripts[c++] = url.toNormalform(false); + for (final MultiProtocolURI u: scriptss) { + inboundLinks.remove(u); + outboundLinks.remove(u); + scripts[c++] = u.toNormalform(false); } add(doc, YaCySchema.scriptscount_i, scripts.length); if (scripts.length > 0) add(doc, YaCySchema.scripts_txt, scripts); @@ -576,10 +586,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; - for (final MultiProtocolURI url: framess) { - inboundLinks.remove(url); - outboundLinks.remove(url); - frames[c++] = url.toNormalform(false); + for (final MultiProtocolURI u: framess) { + inboundLinks.remove(u); + outboundLinks.remove(u); + frames[c++] = u.toNormalform(false); } add(doc, YaCySchema.framesscount_i, frames.length); if (frames.length > 0) add(doc, YaCySchema.frames_txt, frames); @@ -590,10 +600,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; - for (final MultiProtocolURI url: iframess) { - inboundLinks.remove(url); - outboundLinks.remove(url); - iframes[c++] = url.toNormalform(false); + for (final MultiProtocolURI u: iframess) { + inboundLinks.remove(u); + outboundLinks.remove(u); + iframes[c++] = u.toNormalform(false); } add(doc, YaCySchema.iframesscount_i, iframes.length); if (iframes.length > 0) add(doc, YaCySchema.iframes_txt, iframes); @@ -667,13 +677,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final List inboundlinksTextChars = new ArrayList(inboundLinks.size()); final List inboundlinksTextWords = new ArrayList(inboundLinks.size()); final List inboundlinksAltTag = new ArrayList(inboundLinks.size()); - for (final MultiProtocolURI url: 
inboundLinks) { - final Properties p = alllinks.get(url); + for (final MultiProtocolURI u: inboundLinks) { + final Properties p = alllinks.get(u); if (p == null) continue; final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the tag - final String urls = url.toNormalform(false); + final String urls = u.toNormalform(false); final int pr = urls.indexOf("://",0); inboundlinksURLProtocol.add(urls.substring(0, pr)); inboundlinksURLStub.add(urls.substring(pr + 3)); @@ -683,12 +693,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); inboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); inboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(url); + ImageEntry ientry = images.get(u); inboundlinksAltTag.add(ientry == null ? 
"" : ientry.alt()); c++; } @@ -715,13 +725,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final List outboundlinksTextWords = new ArrayList(outboundLinks.size()); final List outboundlinksText = new ArrayList(outboundLinks.size()); final List outboundlinksAltTag = new ArrayList(outboundLinks.size()); - for (final MultiProtocolURI url: outboundLinks) { - final Properties p = alllinks.get(url); + for (final MultiProtocolURI u: outboundLinks) { + final Properties p = alllinks.get(u); if (p == null) continue; final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the tag - final String urls = url.toNormalform(false); + final String urls = u.toNormalform(false); final int pr = urls.indexOf("://",0); outboundlinksURLProtocol.add(urls.substring(0, pr)); outboundlinksURLStub.add(urls.substring(pr + 3)); @@ -731,12 +741,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); outboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); outboundlinksTag.add( - " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""); - ImageEntry ientry = images.get(url); + ImageEntry ientry = images.get(u); inboundlinksAltTag.add(ientry == null ? "" : ientry.alt()); c++; }