diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index 921e7c7b5..32888e9c2 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -50,9 +50,6 @@ fuzzy_signature_unique_b
## the size of the raw source (mandatory field)
size_i
-## index creation comment (mandatory field)
-process_s
-
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
@@ -71,6 +68,10 @@ references_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
+## needed (post-)processing steps on this metadata set
+process_sxt
+
+
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded
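
The multi-valued process_sxt field replaces the old process_s comment field and effectively turns the index into a postprocessing queue: a document that still needs a step carries the name of a ProcessType constant and can be selected later by a worker. A minimal sketch of such a selection, assuming a SolrJ 4.x client and a hypothetical local endpoint (neither is part of this patch):

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.common.SolrDocument;

    public class ProcessQueuePeek {
        public static void main(String[] args) throws SolrServerException {
            // hypothetical endpoint; YaCy normally talks to its embedded Solr core
            HttpSolrServer server = new HttpSolrServer("http://localhost:8983/solr/collection1");
            SolrQuery q = new SolrQuery("process_sxt:CLICKDEPTH"); // documents still waiting for clickdepth postprocessing
            q.setFields("id", "sku", "clickdepth_i");
            q.setRows(10);
            QueryResponse rsp = server.query(q);
            for (SolrDocument d : rsp.getResults()) {
                System.out.println(d.getFieldValue("sku") + " -> clickdepth " + d.getFieldValue("clickdepth_i"));
            }
        }
    }
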
diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 391446f21..0dc590003 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -128,7 +128,7 @@ function updatepage(str) {
#[url]# |
#(stored)#
#(load)#link, detected from context | ::load & index#(/load)# | ::
- indexed | ::
+ indexed | #[comment]# | ::
loading | ::
#[error]# |
#(/stored)#
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index a68a7dc52..edba7f5e0 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -47,6 +47,8 @@ import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
import net.yacy.search.Switchboard;
@@ -251,13 +253,15 @@ public class HostBrowser {
YaCySchema.inboundlinks_protocol_sxt.getSolrFieldName(),
YaCySchema.inboundlinks_urlstub_txt.getSolrFieldName(),
YaCySchema.outboundlinks_protocol_sxt.getSolrFieldName(),
- YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName()
+ YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName(),
+ YaCySchema.clickdepth_i.getSolrFieldName()
);
SolrDocument doc;
Set<String> storedDocs = new HashSet<String>();
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Set<String> inboundLinks = new HashSet<String>();
Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
+ RowHandleMap clickdepth = new RowHandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1, 100, "clickdepth");
int hostsize = 0;
final List<byte[]> deleteIDs = new ArrayList<byte[]>();
long timeout = System.currentTimeMillis() + TIMEOUT;
@@ -265,6 +269,8 @@ public class HostBrowser {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
+ Integer cd = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName());
+ if (cd != null) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue());
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
@@ -399,6 +405,10 @@ public class HostBrowser {
boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
if (!dc) {
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
+ if (type == StoreType.INDEX) {
+ long cd = clickdepth.get(uri.hash());
+ prop.put("files_list_" + c + "_type_stored_comment", cd >= 0 ? "clickdepth = " + cd : "");
+ }
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (error) {
FailType failType = errorDocs.get(entry.getKey());
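
While the Solr result is scanned once, the clickdepth of every indexed document is cached under its url hash and looked up again when the file listing is rendered; a missing entry yields -1 and the comment stays empty. The same collect-then-render pattern, simplified to a plain HashMap instead of YaCy's RowHandleMap (the hash strings are made up):

    import java.util.HashMap;
    import java.util.Map;

    public class ClickdepthComment {
        public static void main(String[] args) {
            // collect phase: remember the clickdepth of every indexed document by its url hash
            Map<String, Integer> clickdepth = new HashMap<String, Integer>();
            clickdepth.put("AAAAAAAAAAAA", 0); // a root page
            clickdepth.put("BBBBBBBBBBBB", 2); // two clicks away from the root

            // render phase: print the comment only if a depth is known,
            // mirroring the files_list_*_type_stored_comment property above
            for (String hash : new String[] {"AAAAAAAAAAAA", "BBBBBBBBBBBB", "CCCCCCCCCCCC"}) {
                Integer cd = clickdepth.get(hash);
                long depth = cd == null ? -1 : cd.longValue();
                System.out.println(hash + ": " + (depth >= 0 ? "clickdepth = " + depth : ""));
            }
        }
    }
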
diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java b/source/net/yacy/cora/federate/solr/ProcessType.java
new file mode 100644
index 000000000..29365708a
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/ProcessType.java
@@ -0,0 +1,31 @@
+/**
+ * ProcessType
+ * Copyright 2013 by Michael Peter Christen
+ * First released 02.01.2013 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.cora.federate.solr;
+
+/**
+ * this enum class is used to define (post-)processing steps that are attached to the solr dataset in the multi-valued field process_sxt
+ */
+public enum ProcessType {
+
+ CLICKDEPTH;
+
+}
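
The enum names are exactly what the yacy2solr() change further down writes into process_sxt, so a postprocessing worker can map the stored strings back with valueOf(). A small round-trip sketch (the nested enum stands in for net.yacy.cora.federate.solr.ProcessType):

    import java.util.ArrayList;
    import java.util.EnumSet;
    import java.util.List;
    import java.util.Set;

    public class ProcessTypeRoundTrip {
        // stand-in for net.yacy.cora.federate.solr.ProcessType
        enum ProcessType { CLICKDEPTH }

        public static void main(String[] args) {
            // writing: the enum names become the values of the multi-valued process_sxt field
            Set<ProcessType> processTypes = EnumSet.of(ProcessType.CLICKDEPTH);
            List<String> fieldValues = new ArrayList<String>();
            for (ProcessType t : processTypes) fieldValues.add(t.name());

            // reading: a postprocessing job turns the stored strings back into enum constants
            for (String v : fieldValues) {
                ProcessType todo = ProcessType.valueOf(v);
                System.out.println("pending step: " + todo);
            }
        }
    }
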
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 9691f51fb..69f5b1e99 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -42,14 +42,14 @@ public enum YaCySchema implements Schema {
fuzzy_signature_text_t(SolrType.text_general, true, true, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool, true, true, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
- process_s(SolrType.string, true, true, false, "index creation comment"),
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
references_i(SolrType.num_integer, true, true, false, "number of unique http references; used for ranking"),
clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
-
+ process_sxt(SolrType.string, true, true, true, "needed (post-)processing steps on this metadata set"),
+
// optional but recommended, part of index distribution
load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, false, "date until resource shall be considered as fresh"),
diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java
index ab124716e..a43674d29 100644
--- a/source/net/yacy/kelondro/data/meta/DigestURI.java
+++ b/source/net/yacy/kelondro/data/meta/DigestURI.java
@@ -32,6 +32,7 @@ import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
+import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -279,10 +280,18 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
private static final char rootURLFlag0 = subdomPortPath("", 80, "");
private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
+ private static final char rootURLFlag2 = subdomPortPath("", 21, "");
+ private static final char rootURLFlag3 = subdomPortPath("ftp", 21, "");
+
+ public static final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
+
+ public final boolean probablyRootURL() {
+ return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash);
+ }
public static final boolean probablyRootURL(final byte[] urlHash) {
- final char c = (char) urlHash[5];
- return c == rootURLFlag0 || c == rootURLFlag1;
+ final char c = (char) urlHash[5];
+ return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3;
}
private static final String hosthash5(final String protocol, final String host, final int port) {
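
Root detection now works on two levels: the path is matched against rootPattern, and the flag character at position 5 of the url hash is compared against the precomputed root flags, now covering ftp (port 21) hosts as well as http/www. A sketch of the path side only, using the same expression (note that the unescaped dots match any character):

    import java.util.regex.Pattern;

    public class RootPathCheck {
        // same expression as DigestURI.rootPattern
        private static final Pattern ROOT = Pattern.compile("/|/index.htm(l?)|/index.php");

        public static void main(String[] args) {
            String[] paths = {"", "/", "/index.html", "/index.htm", "/index.php", "/about.html"};
            for (String p : paths) {
                boolean root = p.length() == 0 || ROOT.matcher(p).matches();
                System.out.println("\"" + p + "\" -> probably root: " + root);
            }
        }
    }
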
diff --git a/source/net/yacy/kelondro/util/ByteBuffer.java b/source/net/yacy/kelondro/util/ByteBuffer.java
index dbe868061..e4c6d2681 100644
--- a/source/net/yacy/kelondro/util/ByteBuffer.java
+++ b/source/net/yacy/kelondro/util/ByteBuffer.java
@@ -226,6 +226,12 @@ public final class ByteBuffer extends OutputStream {
return true;
}
+ public static boolean equals(final byte[] b0, final int off0, final byte[] b1, final int off1, final int length) {
+ if (b0.length - off0 < length || b1.length - off1 < length) return false;
+ for (int i = 0; i < length; i++) if (b0[off0 + i] != b1[off1 + i]) return false;
+ return true;
+ }
+
public void writeTo(final OutputStream dest) throws IOException {
dest.write(this.buffer, this.offset, this.length);
dest.flush();
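
The offset-aware equals() is used by getClickDepth() below to compare only the 6-byte host part (positions 6..11) of two url hashes, so citations from foreign hosts are skipped without allocating sub-arrays. A standalone sketch with the method copied from this patch (the hash values are illustrative; only their length and layout matter):

    public class HostHashCompare {

        // offset-aware comparison as added to net.yacy.kelondro.util.ByteBuffer
        public static boolean equals(final byte[] b0, final int off0, final byte[] b1, final int off1, final int length) {
            if (b0.length - off0 < length || b1.length - off1 < length) return false;
            for (int i = 0; i < length; i++) if (b0[off0 + i] != b1[off1 + i]) return false;
            return true;
        }

        public static void main(String[] args) {
            // a YaCy url hash is 12 bytes; bytes 6..11 encode the host
            byte[] urlhashA = "aaaaaaHOSTXX".getBytes();
            byte[] urlhashB = "bbbbbbHOSTXX".getBytes();
            byte[] hosthash = new byte[6];
            System.arraycopy(urlhashA, 6, hosthash, 0, 6);

            System.out.println(equals(urlhashB, 6, hosthash, 0, 6));                  // true: same host part
            System.out.println(equals("ccccccOTHERY".getBytes(), 6, hosthash, 0, 6)); // false: different host part
        }
    }
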
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index c7fb42215..57e5d76ae 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -363,7 +363,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
- final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language);
+ final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex);
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
for (YaCySchema[] checkfields: new YaCySchema[][]{
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index b68330659..099967c28 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -34,32 +34,39 @@ import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
-import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
+import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.ConfigurationSet;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
+import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.rwi.IndexCell;
+import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
+import net.yacy.kelondro.util.ByteBuffer;
import org.apache.solr.common.SolrInputDocument;
@@ -306,23 +313,40 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
text = text.trim();
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
-
- private final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
- protected SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) {
+ protected SolrInputDocument yacy2solr(
+ final String id, final CrawlProfile profile, final ResponseHeader responseHeader,
+ final Document document, Condenser condenser, DigestURI referrerURL, String language,
+ IndexCell<CitationReference> citations) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source());
boolean allAttr = this.isEmpty();
+
+ Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
+
add(doc, YaCySchema.id, id);
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
String docurl = digestURI.toNormalform(true);
add(doc, YaCySchema.sku, docurl);
if (allAttr || contains(YaCySchema.clickdepth_i)) {
- String path = digestURI.getPath();
- boolean fronturl = path.length() == 0 || rootPattern.matcher(path).matches();
- add(doc, YaCySchema.clickdepth_i, fronturl ? 0 : -1);
+ boolean fronturl = digestURI.probablyRootURL();
+ if (fronturl) {
+ add(doc, YaCySchema.clickdepth_i, 0);
+ } else {
+ // search the citations for references
+ int clickdepth = -1;
+ try {
+ clickdepth = getClickDepth(citations, digestURI.hash());
+ } catch (IOException e) {
+ add(doc, YaCySchema.clickdepth_i, -1);
+ }
+ add(doc, YaCySchema.clickdepth_i, clickdepth);
+ if (clickdepth < 0 || clickdepth > 1) {
+ processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; also done for depths > 1 because a shorter path (shortcut) may exist
+ }
+ }
}
if (allAttr || contains(YaCySchema.ip_s)) {
@@ -800,10 +824,71 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
Set<String> facetValues = facet.getValue();
doc.setField(YaCySchema.VOCABULARY_PREFIX + facetName + YaCySchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
}
-
+
+ if (allAttr || contains(YaCySchema.process_sxt)) {
+ List<String> p = new ArrayList<String>();
+ for (ProcessType t: processTypes) p.add(t.name());
+ add(doc, YaCySchema.process_sxt, p);
+ }
return doc;
}
+ /**
+ * compute the click level using the citation reference database
+ * @param citations the citation database
+ * @param searchhash the hash of the url to be checked
+ * @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
+ * @throws IOException
+ */
+ private int getClickDepth(final IndexCell<CitationReference> citations, byte[] searchhash) throws IOException {
+
+ RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
+ RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
+ try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
+ int leveldepth = 0; // the recursion depth; the returned clickdepth is leveldepth + 1 when a root url is found. Must be 0 for the first call
+ final byte[] hosthash = new byte[6]; // the host of the url to be checked
+ System.arraycopy(searchhash, 6, hosthash, 0, 6);
+
+ long timeout = System.currentTimeMillis() + 10000;
+ for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
+
+ RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
+
+ // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
+ checkloop: for (byte[] urlhash: levelhashes) {
+
+ // get all the citations for this url and iterate
+ ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
+ if (references == null || references.size() == 0) continue checkloop; // don't know
+ Iterator<CitationReference> i = references.entries();
+ nextloop: while (i.hasNext()) {
+ CitationReference ref = i.next();
+ if (ref == null) continue nextloop;
+ byte[] u = ref.urlhash();
+
+ // check ignore
+ if (ignore.has(u)) continue nextloop;
+
+ // check if this is from the same host
+ if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
+
+ // check if the url is a root url
+ if (DigestURI.probablyRootURL(u)) {
+ return leveldepth + 1;
+ }
+
+ // step to next depth level
+ try {checknext.put(u);} catch (SpaceExceededException e) {}
+ try {ignore.put(u);} catch (SpaceExceededException e) {}
+ }
+ }
+ leveldepth++;
+ levelhashes = checknext;
+
+ }
+ return -1;
+ }
+
/**
* this method compresses a list of protocol names to an indexed list.
* To do this, all 'http' entries are removed and considered as default.
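
getClickDepth() is a breadth-first search over the citation (reverse link) index: each round follows all known referrers on the same host, the first round that reaches a root url fixes the depth, and the ignore set plus the 10-level/10-second caps bound the traversal. The same idea on a plain in-memory referrer map, as a sketch only (none of YaCy's data structures are used):

    import java.util.ArrayDeque;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Queue;
    import java.util.Set;

    public class ClickDepthSketch {

        /** referrers maps a page to the pages that link to it, like the citation index */
        static int clickDepth(Map<String, List<String>> referrers, Set<String> roots, String start, int maxDepth) {
            if (roots.contains(start)) return 0;                   // handled by probablyRootURL() in the patch
            Set<String> seen = new HashSet<String>();              // plays the role of the 'ignore' RowHandleSet
            Queue<String> level = new ArrayDeque<String>();
            level.add(start);
            for (int depth = 0; depth < maxDepth; depth++) {
                Queue<String> next = new ArrayDeque<String>();
                for (String page : level) {
                    List<String> refs = referrers.get(page);
                    if (refs == null) continue;                    // no citations known: don't know
                    for (String ref : refs) {
                        if (!seen.add(ref)) continue;              // already visited
                        if (roots.contains(ref)) return depth + 1; // a root links here: depth found
                        next.add(ref);                             // otherwise inspect its referrers next round
                    }
                }
                level = next;
            }
            return -1; // no root reached within the limit; ProcessType.CLICKDEPTH marks the document for postprocessing
        }

        public static void main(String[] args) {
            Map<String, List<String>> referrers = new HashMap<String, List<String>>();
            referrers.put("/deep.html", Arrays.asList("/section.html"));
            referrers.put("/section.html", Arrays.asList("/"));
            Set<String> roots = new HashSet<String>(Arrays.asList("/"));
            System.out.println(clickDepth(referrers, roots, "/deep.html", 10)); // prints 2
        }
    }
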