Introduced new Solr field crawldepth_i which records the crawl depth of
a document. This is the upper limit for the clickdepth_i value, which may
be smaller if the crawler did not take the shortest path to the document.
pull/1/head
Michael Peter Christen 11 years ago
parent d321b0314e
commit cca851a417
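
The relationship between the two fields can be illustrated with a small, hypothetical helper (not part of this commit): since the crawl depth can only overestimate the click depth, it serves as a conservative fallback when no click depth has been computed yet.

    // Hypothetical illustration, not part of this commit: crawldepth_i is an
    // upper bound for clickdepth_i, so a missing click depth can be replaced
    // by the crawl depth without ever underestimating the real value.
    public class DepthBoundExample {
        static int effectiveClickDepth(Integer clickdepth, int crawldepth) {
            // use the measured click depth if present, otherwise fall back
            // to the crawl depth, which is only an upper limit
            if (clickdepth != null && clickdepth >= 0) return Math.min(clickdepth, crawldepth);
            return crawldepth;
        }

        public static void main(String[] args) {
            System.out.println(effectiveClickDepth(null, 3)); // 3 (bound only)
            System.out.println(effectiveClickDepth(1, 3));    // 1 (measured)
        }
    }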

@@ -87,6 +87,9 @@ references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
crawldepth_i
## needed (post-)processing steps on this metadata set
process_sxt
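
The new field can be filtered like any other integer field in Solr. A minimal sketch (the full-text field name text_t and the search term are assumptions; the range filter is standard Solr syntax):

    // Sketch only: build a Solr query string that restricts results to
    // documents at most one crawl hop away from the start URL.
    public class CrawlDepthQueryExample {
        public static void main(String[] args) {
            String q  = "text_t:yacy";            // assumed full-text field
            String fq = "crawldepth_i:[0 TO 1]";  // filter on the new field
            System.out.println("q=" + q + "&fq=" + fq);
        }
    }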

@@ -298,6 +298,7 @@ public class HostBrowser {
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.clickdepth_i.getSolrFieldName(),
CollectionSchema.crawldepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
CollectionSchema.references_external_i.getSolrFieldName(),
@@ -560,17 +561,19 @@ public class HostBrowser {
public static final class InfoCacheEntry {
public Integer cr_n;
public Double cr_c;
public int clickdepth, references, references_internal, references_external, references_exthosts;
public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
public List<String> references_internal_urls, references_external_urls;
public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
// calculate the url reference list
@@ -622,14 +625,11 @@ public class HostBrowser {
}
if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:</br>");
return
(this.clickdepth >= 0 ?
"clickdepth: " + this.clickdepth :
"") +
(this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
(this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
(this.cr_n != null ? ", crn=" + this.cr_n : "") +
(this.references >= 0 ?
", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() :
"");
(this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
}
}
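
The null/negative handling above follows the same pattern for both depth fields; a minimal sketch of that pattern, assuming the same SolrDocument API, could be factored out like this (illustration only, not part of the patch):

    // Minimal sketch: unknown or invalid depth values read from Solr are
    // mapped to the sentinel 999, as in the constructor above.
    import org.apache.solr.common.SolrDocument;

    public class DepthFieldReader {
        static int depthOrSentinel(final SolrDocument doc, final String fieldName) {
            final Integer v = (Integer) doc.getFieldValue(fieldName);
            return (v == null || v.intValue() < 0) ? 999 : v.intValue();
        }
    }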

@@ -95,6 +95,7 @@ public class Document {
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date date;
private int crawldepth;
public Document(final DigestURL location, final String mimeType, final String charset,
final Object parserObject,
@@ -146,8 +147,9 @@
this.text = text == null ? "" : text;
this.generic_facets = new HashMap<String, Set<String>>();
this.date = date == null ? new Date() : date;
this.crawldepth = 999; // unknown yet
}
/**
* Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension.
@@ -740,6 +742,14 @@ dc_rights
return this.indexingDenied;
}
public void setDepth(int depth) {
this.crawldepth = depth;
}
public int getDepth() {
return this.crawldepth;
}
public void writeXML(final Writer os, final Date date) throws IOException {
os.write("<record>\n");
final String title = dc_title();
@@ -819,6 +829,7 @@ dc_rights
double lon = 0.0d, lat = 0.0d;
Date date = new Date();
int mindepth = 999;
for (final Document doc: docs) {
if (doc == null) continue;
@@ -857,6 +868,8 @@ dc_rights
images.putAll(doc.getImages());
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
if (doc.date.before(date)) date = doc.date;
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
}
// clean up parser data
@@ -871,7 +884,7 @@ dc_rights
// return consolidation
ArrayList<String> titlesa = new ArrayList<String>();
titlesa.addAll(titles);
return new Document(
Document newDoc = new Document(
location,
globalMime,
null,
@@ -890,6 +903,8 @@ dc_rights
images,
false,
date);
newDoc.setDepth(mindepth);
return newDoc;
}
public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
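
The merge rule added above gives the combined document the smallest crawl depth of its parts, with 999 still meaning "unknown". This standalone sketch shows the rule in isolation (illustration only, not code from the patch):

    // Standalone sketch of the min-depth merge rule used when merging documents.
    import java.util.Arrays;
    import java.util.List;

    public class MinDepthMergeExample {
        static int mergedDepth(final List<Integer> depths) {
            int mindepth = 999; // unknown yet, same sentinel as in Document
            for (final Integer d : depths) {
                if (d != null && d < mindepth) mindepth = d;
            }
            return mindepth;
        }

        public static void main(String[] args) {
            System.out.println(mergedDepth(Arrays.asList(4, 2, 7)));   // 2
            System.out.println(mergedDepth(Arrays.<Integer>asList())); // 999 (unknown)
        }
    }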

@@ -2591,6 +2591,9 @@ public final class Switchboard extends serverSwitch {
response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches()
)
) {
for (Document d: documents) d.setDepth(response.depth());
// get the hyperlinks
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
if (response.profile().indexMedia()) {

@@ -377,10 +377,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} else {
clickdepth = 999;
}
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // thats not true if the start url was not a root URL. We need a test for that.
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
}
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
CollectionSchema.crawldepth_i.add(doc, document.getDepth());
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
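
The Math.min shortcut above is only applied for the first two crawl levels, and the in-code comment notes that it is additionally only correct when the crawl started at a root URL, a check that does not exist yet. A hypothetical guarded version of the rule (the startedAtRoot flag is an assumption, not part of the patch) could look like this:

    // Hypothetical sketch: bound the click depth by the crawl depth only when
    // the crawl started at a root URL and the document is at most one hop deep.
    public class ClickDepthBoundExample {
        static int boundClickDepth(int clickdepth, int crawldepth, boolean startedAtRoot) {
            if (startedAtRoot && crawldepth < 2) return Math.min(clickdepth, crawldepth);
            return clickdepth; // otherwise the crawl path gives no reliable bound
        }
    }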

@@ -58,6 +58,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
