- added an order option to solr queries to be able to retrieve document lists in a specific order, here: link length
- added the HyperlinkEdge class which manages the link structure
- integrated the HyperlinkEdge class into the clickdepth computation
- extended the linkstructure.json servlet to also show the clickdepth and other statistical information
parent df138084c0
commit bd886054cb
linkstructure.json
@@ -1,5 +1,7 @@
-[
-#{list}#
-{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#"}#(eol)#::,#(/eol)#
-#{/list}#
-]
+{
+"edges" : "#[edges]#",
+"maxdepth" : "#[maxdepth]#",
+"graph" : [#{edges}#
+{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#", "depthSource":"#[depthSource]#", "depthTarget":"#[depthTarget]#"}#(eol)#::,#(/eol)#
+#{/edges}#]
+}
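
For illustration, the servlet fills these placeholders from a HyperlinkGraph instance (the new class below); a rendered response might look like this, with a hypothetical host and made-up values:

{
"edges" : "2",
"maxdepth" : "1",
"graph" : [
{"source":"http://example.org/", "target":"http://example.org/help.html", "type":"Inbound", "depthSource":"0", "depthTarget":"1"},
{"source":"http://example.org/", "target":"http://example.org/about.html", "type":"Inbound", "depthSource":"0", "depthTarget":"1"}
]
}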
HyperlinkGraph.java (new file)
@@ -0,0 +1,197 @@
/**
 *  HyperlinkGraph
 *  Copyright 2014 by Michael Peter Christen
 *  First released 08.04.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.schema;

import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;

import org.apache.solr.common.SolrDocument;


public class HyperlinkGraph implements Iterable<HyperlinkEdge> {

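    // file names that identify the root page of a host; documents with one of these paths get click depth 0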
    public final static Set<String> ROOTFNS = new HashSet<String>();
    static {
        for (String s: new String[]{"/", "/index.htm", "/index.html", "/index.php", "/home.htm", "/home.html", "/home.php", "/default.htm", "/default.html", "/default.php"}) {
            ROOTFNS.add(s);
        }
    }

    Map<String, HyperlinkEdge> edges;
    Map<DigestURL, Integer> depths;
    String hostname;

    public HyperlinkGraph() {
        this.edges = new LinkedHashMap<String, HyperlinkEdge>();
        this.depths = new HashMap<DigestURL, Integer>();
        this.hostname = null;
    }

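    // read all documents of one host from the Solr index and convert their
    // inbound and outbound links into HyperlinkEdge objects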
    public void fill(final SolrConnector solrConnector, String hostname, final int maxtime, final int maxnodes) {
        this.hostname = hostname;
        if (hostname.startsWith("www.")) hostname = hostname.substring(4);
        StringBuilder q = new StringBuilder();
        q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
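        // retrieve the documents sorted by URL length (url_chars_i asc, the new order option):
        // short URLs are close to the root of the host and shall be processed first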
        BlockingQueue<SolrDocument> docs = solrConnector.concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, 100, 1,
                CollectionSchema.id.getSolrFieldName(),
                CollectionSchema.sku.getSolrFieldName(),
                CollectionSchema.failreason_s.getSolrFieldName(),
                CollectionSchema.failtype_s.getSolrFieldName(),
                CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
                );
        SolrDocument doc;
        Map<String, FailType> errorDocs = new HashMap<String, FailType>();
        Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>();
        Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
        Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
        try {
            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
                String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
                FailType error = errortype == null ? null : FailType.valueOf(errortype);
                if (error != null) {
                    errorDocs.put(u, error);
                } else {
                    Iterator<String> links = URIMetadataNode.getLinks(doc, true); // inbound
                    String link;
                    while (links.hasNext()) {
                        link = links.next();
                        try {
                            DigestURL linkurl = new DigestURL(link, null);
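                            // the edge key concatenates the source and target url hashes, so
                            // repeated links between the same two documents collapse into one edge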
                            String edgehash = ids + ASCII.String(linkurl.hash());
                            inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound));
                        } catch (MalformedURLException e) {}
                    }
                    links = URIMetadataNode.getLinks(doc, false); // outbound
                    while (links.hasNext()) {
                        link = links.next();
                        try {
                            DigestURL linkurl = new DigestURL(link, null);
                            String edgehash = ids + ASCII.String(linkurl.hash());
                            outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound));
                        } catch (MalformedURLException e) {}
                    }
                }
                if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
                    break;
                }
            }
        } catch (InterruptedException e) {
        } catch (MalformedURLException e) {
        }
        // use the errorDocs to mark all edges which point to error documents
        Iterator<Map.Entry<String, HyperlinkEdge>> i = inboundEdges.entrySet().iterator();
        Map.Entry<String, HyperlinkEdge> edge;
        while (i.hasNext()) {
            edge = i.next();
            if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
                i.remove();
                edge.getValue().type = HyperlinkEdge.Type.Dead;
                errorEdges.put(edge.getKey(), edge.getValue());
            }
        }
        i = outboundEdges.entrySet().iterator();
        while (i.hasNext()) {
            edge = i.next();
            if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
                i.remove();
                edge.getValue().type = HyperlinkEdge.Type.Dead;
                errorEdges.put(edge.getKey(), edge.getValue());
            }
        }
        // put all edges together in a specific order which is used to create nodes in a svg display:
        // nodes that appear first may be painted over by nodes coming later,
        // therefore the less important nodes shall appear first
        this.edges.putAll(outboundEdges);
        this.edges.putAll(inboundEdges);
        this.edges.putAll(errorEdges);
    }

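    // level-by-level search over the edges: root pages get depth 0, pages linked
    // from a root get depth 1, and so on; returns the maximum click depth found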
    public int findLinkDepth() {

        int remaining = this.edges.size();

        // first find root nodes
        Set<DigestURL> nodes = new HashSet<DigestURL>();
        Set<DigestURL> nextnodes = new HashSet<DigestURL>();
        for (HyperlinkEdge edge: this.edges.values()) {
            String path = edge.source.getPath();
            if (ROOTFNS.contains(path)) {
                if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, 0);
                if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, 1);
                nodes.add(edge.source);
                nextnodes.add(edge.target);
                remaining--;
            }
        }
        if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");

        // recursively step into depth and find the next level
        int depth = 1;
        while (remaining > 0) {
            boolean found = false;
            nodes = nextnodes;
            nextnodes = new HashSet<DigestURL>();
            for (HyperlinkEdge edge: this.edges.values()) {
                if (nodes.contains(edge.source)) {
                    if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, depth);
                    if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, depth + 1);
                    nextnodes.add(edge.target);
                    remaining--;
                    found = true;
                }
            }
            depth++;
            if (!found) break; // terminate in case that not all edges are connected to a root
        }
        if (remaining > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find all edges for " + hostname + ", " + remaining + " remaining.");
        return depth - 1;
    }

    public Integer getDepth(DigestURL url) {
        return this.depths.get(url);
    }

    @Override
    public Iterator<HyperlinkEdge> iterator() {
        return this.edges.values().iterator();
    }

}
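
The HyperlinkEdge class referenced above is added elsewhere in this commit and does not appear in this diff. From its usage here (public source, target and mutable type fields, a (source, target, type) constructor, and the Inbound/Outbound/Dead type constants) it presumably looks roughly like the following minimal sketch; this is an inference from usage, not the committed implementation:

package net.yacy.search.schema;

import net.yacy.cora.document.id.DigestURL;

// minimal sketch of HyperlinkEdge, inferred from its usage in HyperlinkGraph
public class HyperlinkEdge {

    public enum Type {
        Inbound,  // link stays within the same host
        Outbound, // link leaves the host
        Dead      // link points to a document that failed to load
    }

    public DigestURL source, target;
    public Type type; // mutable: fill() turns edges Dead when the target is an error document

    public HyperlinkEdge(final DigestURL source, final DigestURL target, final Type type) {
        this.source = source;
        this.target = target;
        this.type = type;
    }
}

A caller, for example the extended linkstructure.json servlet, would then use the graph roughly like this (hypothetical parameter values; solrConnector is an existing SolrConnector for the web index):

HyperlinkGraph graph = new HyperlinkGraph();
graph.fill(solrConnector, "example.org", 10000 /* maxtime */, 10000 /* maxnodes */);
int maxdepth = graph.findLinkDepth();
for (HyperlinkEdge e : graph) {
    System.out.println(e.source.toNormalform(true) + " -> " + e.target.toNormalform(true)
            + " (" + e.type + ", depth " + graph.getDepth(e.target) + ")");
}
System.out.println("maxdepth = " + maxdepth);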