Preparations to produce a click depth attribute in the search index.

This attribute can be used for ranking and for other purposes (requested by a customer). The click depth is computed in two steps: - during indexing, the current fill state of the reverse link index is used to backtrack from the current page to the root page. The length of that backtrack is the click depth. However, this does not necessarily discover the shortest click depth; to get that, a second process that checks again is needed. - added a process tag that can be used to run operations on the existing index after a crawl, i.e. calculating the shortest click path. Added a field to control this operation, but not yet a method that performs it. - added a visualization of the click path length in the host browser.
12 years ago · 5c0c56cfe1
parent 6861af87e2
commit 5c0c56cfe1
9 changed files with 160 additions and 18 deletions
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@ -50,9 +50,6 @@ fuzzy_signature_unique_b
 ## the size of the raw source (mandatory field)
 size_i

-## index creation comment (mandatory field)
-process_s
-
 ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
 failreason_t

@ -71,6 +68,10 @@ references_i
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
 clickdepth_i

+## needed (post-)processing steps on this metadata set
+process_sxt
+
+
 ### optional but highly recommended values, part of the index distribution process

 ## time when resource was loaded
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@ -128,7 +128,7 @@ function updatepage(str) {
          <td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#" target="_blank"><img src="/env/grafics/link.gif"/></a></td>
          #(stored)#
          #(load)#<td align="left" colspan="5" nowrap class="listingem">link, detected from context</td>::<td align="left" colspan="5" nowrap class="listingnok"><a href="/HostBrowser.html?load=#[url]#&path=#[path]#">load &amp; index</a>#(/load)#</td>::
-          <td align="left" colspan="5" nowrap class="listingok">indexed</td>::
+          <td align="left" colspan="3" nowrap class="listingok">indexed</td><td align="left" colspan="2" nowrap class="listingok">#[comment]#</td>::
 		  <td align="left" colspan="5" nowrap class="pending">loading</td>::
 		  <td align="left" colspan="5" nowrap class="listingnok">#[error]#</td>
          #(/stored)#
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@ -47,6 +47,8 @@ import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleMap;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
 import net.yacy.search.Switchboard;
@ -251,13 +253,15 @@ public class HostBrowser {
                        YaCySchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                        YaCySchema.inboundlinks_urlstub_txt.getSolrFieldName(),
                        YaCySchema.outboundlinks_protocol_sxt.getSolrFieldName(),
-                        YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName()
+                        YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName(),
+                        YaCySchema.clickdepth_i.getSolrFieldName()
                        );
                SolrDocument doc;
                Set<String> storedDocs = new HashSet<String>();
                Map<String, FailType> errorDocs = new HashMap<String, FailType>();
                Set<String> inboundLinks = new HashSet<String>();
                Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
+                RowHandleMap clickdepth = new RowHandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1, 100, "clickdepth");
                int hostsize = 0;
                final List<byte[]> deleteIDs = new ArrayList<byte[]>();
                long timeout = System.currentTimeMillis() + TIMEOUT;
@ -265,6 +269,8 @@ public class HostBrowser {
                    String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                    String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName());
                    FailType error = errortype == null ? null : FailType.valueOf(errortype);  
+                    Integer cd = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName());
+                    if (cd != null) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue());
                    if (u.startsWith(path)) {
                        if (delete) {
                            deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
@ -399,6 +405,10 @@ public class HostBrowser {
                        boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
                        if (!dc) {
                            prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
+                            if (type == StoreType.INDEX) {
+                                long cd = clickdepth.get(uri.hash());
+                                prop.put("files_list_" + c + "_type_stored_comment", cd >= 0 ? "clickdepth = " + cd : "");
+                            }
                            prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
                            if (error) {
                                FailType failType = errorDocs.get(entry.getKey());
--- a/source/net/yacy/cora/federate/solr/ProcessType.java
+++ b/source/net/yacy/cora/federate/solr/ProcessType.java
@ -0,0 +1,31 @@
+/**
+ *  ProcessType
+ *  Copyright 2013 by Michael Peter Christen
+ *  First released 02.01.2013 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.cora.federate.solr;
+
+/**
+ * Enumeration of the (post-)processing steps that can be attached to a solr dataset.
+ * The names of these constants are written into the field process_sxt
+ * (which replaces the former field process_s).
+ */
+public enum ProcessType {
+
+    CLICKDEPTH; // marks a dataset whose click depth needs post-processing after indexing
+    
+}
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@ -42,14 +42,14 @@ public enum YaCySchema implements Schema {
    fuzzy_signature_text_t(SolrType.text_general, true, true, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
    fuzzy_signature_unique_b(SolrType.bool, true, true, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
    size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
-    process_s(SolrType.string, true, true, false, "index creation comment"),
    failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
    failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
    httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
    httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
    references_i(SolrType.num_integer, true, true, false, "number of unique http references; used for ranking"),
    clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
-
+    process_sxt(SolrType.string, true, true, true, "needed (post-)processing steps on this metadata set"),
+    
    // optional but recommended, part of index distribution
    load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"),
    fresh_date_dt(SolrType.date, true, true, false, "date until resource shall be considered as fresh"),
--- a/source/net/yacy/kelondro/data/meta/DigestURI.java
+++ b/source/net/yacy/kelondro/data/meta/DigestURI.java
@ -32,6 +32,7 @@ import java.io.Serializable;
 import java.net.MalformedURLException;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.regex.Pattern;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
@ -279,10 +280,18 @@ public class DigestURI extends MultiProtocolURI implements Serializable {

    private static final char rootURLFlag0 = subdomPortPath("", 80, "");
    private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
+    private static final char rootURLFlag2 = subdomPortPath("", 21, "");
+    private static final char rootURLFlag3 = subdomPortPath("ftp", 21, "");
+
+    public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
+    
+    public final boolean probablyRootURL() { // heuristic: does this url look like the root ('main') page of its host?
+        return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash); // true for an empty path, a path that fully matches rootPattern, or a hash accepted by the static hash-based check
+    }

     public static final boolean probablyRootURL(final byte[] urlHash) {
-    	final char c = (char) urlHash[5];
-        return c == rootURLFlag0 || c == rootURLFlag1;
+        final char c = (char) urlHash[5]; // the single hash byte that is compared against the rootURLFlag constants
+        return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3; // the flags are produced by subdomPortPath for ("", 80), ("www", 80), ("", 21) and ("ftp", 21), each with an empty path
     }

    private static final String hosthash5(final String protocol, final String host, final int port) {
--- a/source/net/yacy/kelondro/util/ByteBuffer.java
+++ b/source/net/yacy/kelondro/util/ByteBuffer.java
@ -226,6 +226,12 @@ public final class ByteBuffer extends OutputStream {
        return true;
    }

+    public static boolean equals(final byte[] b0, final int off0, final byte[] b1, final int off1, final int length) { // true iff the 'length' bytes of b0 starting at off0 equal the 'length' bytes of b1 starting at off1
+        if (b0.length - off0 < length || b1.length - off1 < length) return false; // one of the arrays is too short to provide 'length' bytes from its offset; NOTE(review): negative offsets are not rejected here
+        for (int i = 0; i < length; i++) if (b0[off0 + i] != b1[off1 + i]) return false; // the first differing byte decides
+        return true; // all compared bytes matched (trivially so if the loop ran zero times)
+    }
+
    public void writeTo(final OutputStream dest) throws IOException {
    	dest.write(this.buffer, this.offset, this.length);
        dest.flush();
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -363,7 +363,7 @@ public class Segment {
        char docType = Response.docType(document.dc_format());
        
        // CREATE SOLR DOCUMENT
-        final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language);
+        final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex);
        
        // FIND OUT IF THIS IS A DOUBLE DOCUMENT
        for (YaCySchema[] checkfields: new YaCySchema[][]{
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@ -34,32 +34,39 @@ import java.util.Collection;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
-import java.util.regex.Pattern;

 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.solr.FailType;
+import net.yacy.cora.federate.solr.ProcessType;
 import net.yacy.cora.federate.solr.YaCySchema;
 import net.yacy.cora.federate.yacy.ConfigurationSet;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
+import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.rwi.IndexCell;
+import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.util.Bitfield;
+import net.yacy.kelondro.util.ByteBuffer;

 import org.apache.solr.common.SolrInputDocument;

@ -306,23 +313,40 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
    	text = text.trim();
    	if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
    }
-
-    private final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php");
    
-    protected SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) {
+    protected SolrInputDocument yacy2solr(
+            final String id, final CrawlProfile profile, final ResponseHeader responseHeader,
+            final Document document, Condenser condenser, DigestURI referrerURL, String language,
+            IndexCell<CitationReference> citations) {
        // we use the SolrCell design as index scheme
        final SolrInputDocument doc = new SolrInputDocument();
        final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source());
        boolean allAttr = this.isEmpty();
+        
+        Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
+        
        add(doc, YaCySchema.id, id);
        if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
        String docurl = digestURI.toNormalform(true);
        add(doc, YaCySchema.sku, docurl);

        if (allAttr || contains(YaCySchema.clickdepth_i)) {
-            String path = digestURI.getPath();
-            boolean fronturl = path.length() == 0 || rootPattern.matcher(path).matches();
-            add(doc, YaCySchema.clickdepth_i, fronturl ? 0 : -1);
+            boolean fronturl = digestURI.probablyRootURL();
+            if (fronturl) {
+                add(doc, YaCySchema.clickdepth_i, 0);
+            } else {
+                // search the citations for references
+                int clickdepth = -1;
+                try {
+                    clickdepth = getClickDepth(citations, digestURI.hash());
+                } catch (IOException e) {
+                    add(doc, YaCySchema.clickdepth_i, -1);
+                }
+                add(doc, YaCySchema.clickdepth_i, clickdepth);
+                if (clickdepth < 0 || clickdepth > 1) {
+                    processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+                }
+            }
        }
        
        if (allAttr || contains(YaCySchema.ip_s)) {
@ -800,10 +824,71 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
            Set<String> facetValues = facet.getValue();
            doc.setField(YaCySchema.VOCABULARY_PREFIX + facetName + YaCySchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
        }
-        
+
+        if (allAttr || contains(YaCySchema.process_sxt)) {
+            List<String> p = new ArrayList<String>();
+            for (ProcessType t: processTypes) p.add(t.name());
+            add(doc, YaCySchema.process_sxt, p);
+        }
        return doc;
    }

+    /**
+     * Compute the click level of a url using the citation reference database.
+     * Starting with the given hash, the method walks the citations backwards level by level:
+     * for every hash of the current level it reads the citing references, skips hashes that
+     * were already seen and hashes whose bytes 6..11 differ from those of the search hash
+     * (used here as the host part), and returns as soon as a citing hash is accepted by
+     * DigestURI.probablyRootURL(byte[]).
+     * The search is bounded to 10 levels and to a timeout of 10000 milliseconds; the timeout
+     * is only checked between two levels.
+     * The result reflects the citations that are present at the time of the call.
+     * @param citations the citation database
+     * @param searchhash the hash of the url to be checked
+     * @return the clickdepth level (at least 1) or -1 if the root url cannot be found or the level or time limit is reached
+     * @throws IOException if the search hash cannot be stored in the initial level set
+     *         (a SpaceExceededException is wrapped); presumably also if reading from the
+     *         citation database fails - TODO confirm against IndexCell.get
+     */
+    private int getClickDepth(final IndexCell<CitationReference> citations, byte[] searchhash) throws IOException {
+
+        RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
+        RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
+        try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);} // NOTE(review): searchhash itself is not added to 'ignore'
+        int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
+        final byte[] hosthash = new byte[6]; // the host of the url to be checked
+        System.arraycopy(searchhash, 6, hosthash, 0, 6); // bytes 6..11 of the url hash are used as the host part
+        
+        long timeout = System.currentTimeMillis() + 10000; // absolute deadline, 10 seconds from now
+        for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) { // maxdepth advances in lockstep with leveldepth; it only serves as the level limit
+            
+            RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // collects the hashes of the next level
+            
+            // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
+            checkloop: for (byte[] urlhash: levelhashes) {
+    
+                // get all the citations for this url and iterate
+                ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
+                if (references == null || references.size() == 0) continue checkloop; // don't know
+                Iterator<CitationReference> i = references.entries();
+                nextloop: while (i.hasNext()) {
+                    CitationReference ref = i.next();
+                    if (ref == null) continue nextloop;
+                    byte[] u = ref.urlhash(); // hash of a url that cites urlhash
+                    
+                    // check ignore
+                    if (ignore.has(u)) continue nextloop;
+                    
+                    // check if this is from the same host
+                    if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
+                    
+                    // check if the url is a root url
+                    if (DigestURI.probablyRootURL(u)) {
+                        return leveldepth + 1; // the citing url is a root url, so the searched url is one more click away than the current level
+                    }
+                    
+                    // step to next depth level
+                    try {checknext.put(u);} catch (SpaceExceededException e) {} // NOTE(review): a failed insert is silently ignored; the hash is then not followed further
+                    try {ignore.put(u);} catch (SpaceExceededException e) {} // NOTE(review): a failed insert is silently ignored; the hash may then be visited again
+                }
+            }
+            leveldepth++;
+            levelhashes = checknext; // continue with the hashes collected for the next level
+        
+        }
+        return -1; // no root url was found within the level and time limits
+    }
+    
    /**
     * this method compresses a list of protocol names to an indexed list.
     * To do this, all 'http' entries are removed and considered as default.