From bd886054cb0ee3e4f562e88ec2127d432d185359 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 9 Apr 2014 12:45:04 +0200 Subject: [PATCH] new structure and enhancements for link graph computation: - added an order option to solr queries so that document lists can be retrieved in a specific order, here: by link length - added the HyperlinkGraph class, which manages the link structure - integrated the HyperlinkGraph class into the clickdepth computation - extended the linkstructure.json servlet to also show the clickdepth and other statistical information --- htroot/HostBrowser.java | 2 +- htroot/IndexDeletion_p.java | 2 +- htroot/api/citation.java | 2 +- htroot/api/linkstructure.java | 119 ++--------- htroot/api/linkstructure.json | 12 +- htroot/js/hypertree.js | 4 +- .../opensearch/OpenSearchConnector.java | 4 +- .../federate/solr/SchemaConfiguration.java | 2 +- .../solr/connector/AbstractSolrConnector.java | 42 +++- .../solr/connector/CachedSolrConnector.java | 6 +- .../ConcurrentUpdateSolrConnector.java | 12 +- .../solr/connector/EmbeddedSolrConnector.java | 19 +- .../solr/connector/MirrorSolrConnector.java | 20 +- .../solr/connector/SolrConnector.java | 12 +- source/net/yacy/search/index/ErrorCache.java | 2 +- source/net/yacy/search/index/Fulltext.java | 4 +- .../search/index/ReindexSolrBusyThread.java | 2 +- source/net/yacy/search/index/Segment.java | 39 ++-- .../schema/CollectionConfiguration.java | 8 +- .../net/yacy/search/schema/HyperlinkEdge.java | 12 ++ .../yacy/search/schema/HyperlinkGraph.java | 197 ++++++++++++++++++ 21 files changed, 346 insertions(+), 176 deletions(-) create mode 100644 source/net/yacy/search/schema/HyperlinkGraph.java diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 114fa3824..dbcc56bba 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -288,7 +288,7 @@ public class HostBrowser { q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM); } } - BlockingQueue docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, 100000, TIMEOUT, 100, 1, + BlockingQueue docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.failreason_s.getSolrFieldName(), diff --git a/htroot/IndexDeletion_p.java b/htroot/IndexDeletion_p.java index e569e8116..224ae41fd 100644 --- a/htroot/IndexDeletion_p.java +++ b/htroot/IndexDeletion_p.java @@ -130,7 +130,7 @@ public class IndexDeletion_p { } try { DigestURL u = new DigestURL(urlStub); - BlockingQueue dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", 0, 100000000, Long.MAX_VALUE, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + BlockingQueue dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", null, 0, 100000000, Long.MAX_VALUE, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); SolrDocument doc; try { while ((doc = dq.take()) != AbstractSolrConnector.POISON_DOCUMENT) { diff --git a/htroot/api/citation.java b/htroot/api/citation.java index 2d76f3fc9..079efab2d 100644 --- a/htroot/api/citation.java +++ b/htroot/api/citation.java @@ -127,7 +127,7 @@ public class citation { } try { sentence =
sentence.replace('"', '\''); - SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName()); + SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100, CollectionSchema.sku.getSolrFieldName()); int count = (int) doclist.getNumFound(); if (count > 0) { Set list = new TreeSet(); diff --git a/htroot/api/linkstructure.java b/htroot/api/linkstructure.java index e988c18f3..4db2f96fd 100644 --- a/htroot/api/linkstructure.java +++ b/htroot/api/linkstructure.java @@ -17,29 +17,18 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - import java.net.MalformedURLException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.concurrent.BlockingQueue; - -import org.apache.solr.common.SolrDocument; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.federate.solr.FailType; -import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; -import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.HyperlinkEdge; +import net.yacy.search.schema.HyperlinkGraph; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.server.servletProperties; @@ -59,8 +48,8 @@ public class linkstructure { String about = post.get("about", null); // may be a URL, a URL hash or a domain hash if (about == null) return prop; boolean authenticated = sb.adminAuthenticated(header) >= 2; - int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 60000 : 1000); - int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 1000 : 100); + int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 300000 : 1000); + int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 10000000 : 100); DigestURL url = null; String hostname = null; @@ -72,104 +61,32 @@ public class linkstructure { try { url = new DigestURL(about.indexOf("://") >= 0 ? 
about : "http://" + about); // accept also domains hostname = url.getHost(); - if (hostname.startsWith("www.")) hostname = hostname.substring(4); } catch (final MalformedURLException e) { } } if (hostname == null) return prop; // now collect _all_ documents inside the domain until a timeout appears - StringBuilder q = new StringBuilder(); - q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname); - BlockingQueue docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, maxnodes, maxtime, 100, 1, - CollectionSchema.id.getSolrFieldName(), - CollectionSchema.sku.getSolrFieldName(), - CollectionSchema.failreason_s.getSolrFieldName(), - CollectionSchema.failtype_s.getSolrFieldName(), - CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), - CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(), - CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), - CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() - ); - SolrDocument doc; - Map errorDocs = new HashMap(); - Map inboundEdges = new HashMap(); - Map outboundEdges = new HashMap(); - Map errorEdges = new HashMap(); - try { - while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - DigestURL from = new DigestURL(u, ASCII.getBytes(ids)); - String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); - FailType error = errortype == null ? null : FailType.valueOf(errortype); - if (error != null) { - errorDocs.put(u, error); - } else { - Iterator links = URIMetadataNode.getLinks(doc, true); // inbound - String link; - while (links.hasNext()) { - link = links.next(); - try { - DigestURL linkurl = new DigestURL(link, null); - String edgehash = ids + ASCII.String(linkurl.hash()); - inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound)); - } catch (MalformedURLException e) {} - } - links = URIMetadataNode.getLinks(doc, false); // outbound - while (links.hasNext()) { - link = links.next(); - try { - DigestURL linkurl = new DigestURL(link, null); - String edgehash = ids + ASCII.String(linkurl.hash()); - outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound)); - } catch (MalformedURLException e) {} - } - } - if (inboundEdges.size() + outboundEdges.size() > maxnodes) break; - } - } catch (InterruptedException e) { - } catch (MalformedURLException e) { - } - // we use the errorDocs to mark all edges with endpoint to error documents - Iterator> i = inboundEdges.entrySet().iterator(); - Map.Entry edge; - while (i.hasNext()) { - edge = i.next(); - if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) { - i.remove(); - edge.getValue().type = HyperlinkEdge.Type.Dead; - errorEdges.put(edge.getKey(), edge.getValue()); - } - } - i = outboundEdges.entrySet().iterator(); - while (i.hasNext()) { - edge = i.next(); - if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) { - i.remove(); - edge.getValue().type = HyperlinkEdge.Type.Dead; - errorEdges.put(edge.getKey(), edge.getValue()); - } - } - // we put all edges together in a specific order which is used to create nodes in a svg display: - // notes that appear first are possible painted over by nodes coming later. 
- // less important nodes shall appear therefore first - Map edges = new LinkedHashMap(); - edges.putAll(outboundEdges); - edges.putAll(inboundEdges); - edges.putAll(errorEdges); + HyperlinkGraph hlg = new HyperlinkGraph(); + hlg.fill(fulltext.getDefaultConnector(), hostname, maxtime, maxnodes); + int maxdepth = hlg.findLinkDepth(); // finally just write out the edge array int c = 0; - for (Map.Entry e: edges.entrySet()) { - prop.putJSON("list_" + c + "_source", e.getValue().source.getPath()); - prop.putJSON("list_" + c + "_target", e.getValue().type.equals(HyperlinkEdge.Type.Outbound) ? e.getValue().target.toNormalform(true) : e.getValue().target.getPath()); - prop.putJSON("list_" + c + "_type", e.getValue().type.name()); - prop.put("list_" + c + "_eol", 1); + for (HyperlinkEdge e: hlg) { + prop.putJSON("edges_" + c + "_source", e.source.getPath()); + prop.putJSON("edges_" + c + "_target", e.type.equals(HyperlinkEdge.Type.Outbound) ? e.target.toNormalform(true) : e.target.getPath()); + prop.putJSON("edges_" + c + "_type", e.type.name()); + Integer depth_source = hlg.getDepth(e.source); + Integer depth_target = hlg.getDepth(e.target); + prop.put("edges_" + c + "_depthSource", depth_source == null ? -1 : depth_source.intValue()); + prop.put("edges_" + c + "_depthTarget", depth_target == null ? -1 : depth_target.intValue()); + prop.put("edges_" + c + "_eol", 1); c++; } - prop.put("list_" + (c-1) + "_eol", 0); - prop.put("list", c); + prop.put("edges_" + (c-1) + "_eol", 0); + prop.put("edges", c); + prop.put("maxdepth", maxdepth); // Adding CORS Access header for xml output if (xml) { diff --git a/htroot/api/linkstructure.json b/htroot/api/linkstructure.json index 84cec3aec..9421a56b0 100644 --- a/htroot/api/linkstructure.json +++ b/htroot/api/linkstructure.json @@ -1,5 +1,7 @@ -[ -#{list}# -{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#"}#(eol)#::,#(/eol)# -#{/list}# -] \ No newline at end of file +{ +"edges" : "#[edges]#", +"maxdepth" : "#[maxdepth]#", +"graph" : [#{edges}# +{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#", "depthSource":"#[depthSource]#", "depthTarget":"#[depthTarget]#"}#(eol)#::,#(/eol)# +#{/edges}#] +} \ No newline at end of file diff --git a/htroot/js/hypertree.js b/htroot/js/hypertree.js index d05b931b1..ef0014877 100644 --- a/htroot/js/hypertree.js +++ b/htroot/js/hypertree.js @@ -1,7 +1,9 @@ function linkstructure(hostname, element, width, height, maxtime, maxnodes) { var nodes = {}; var links = []; - $.getJSON("/api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(links) { + var linkstructure = {}; + $.getJSON("/api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(linkstructure) { + links = linkstructure.graph; links.forEach(function(link) { link.source = nodes[link.source] || (nodes[link.source] = {name: link.source, type:"Inbound"}); link.target = nodes[link.target] || (nodes[link.target] = {name: link.target, type:link.type}); diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 14ced0fed..9590da53b 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -201,7 +201,7 @@ public class OpenSearchConnector { final long numfound; try { - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, 0, 1, webgraphqueryfields); + 
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields); numfound = docList.getNumFound(); if (numfound == 0) { ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job"); @@ -226,7 +226,7 @@ public class OpenSearchConnector { Set dblmem = new HashSet(); // temp memory for already checked url while (doloop) { ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound)); - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents + SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents loopnr++; if (stoptime < System.currentTimeMillis()) {// stop after max 1h doloop = false; diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 3e6601a55..10805fbb5 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -158,7 +158,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { continue uniquecheck; } try { - final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", 0, 1); + final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", null, 0, 1); if (docs != null && !docs.isEmpty()) { SolrDocument doc = docs.get(0); // switch unique attribute in new document diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index ad0dc0ef4..cb8313aef 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -134,6 +134,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned. 
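A minimal consumer of this changed API might look as follows; the connector variable, the host, and the limits are invented for illustration, and the sort string mirrors the url_chars_i ordering that the callers in this patch use:

    BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
            CollectionSchema.host_s.getSolrFieldName() + ":example.org",  // query: all documents of one host
            CollectionSchema.url_chars_i.getSolrFieldName() + " asc",     // sort: shortest URLs first
            0, 1000, 10000, 100, 1,                                       // offset, maxcount, maxtime, buffersize, concurrency
            CollectionSchema.sku.getSolrFieldName());                     // field list
    try {
        SolrDocument doc;
        while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
            // process each document here
        }
    } catch (final InterruptedException e) {}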
* The method returns immediately and feeds the search results into the queue * @param querystring the solr query string + * @param sort the solr sort string, may be null to be not used * @param offset first result offset * @param maxcount the maximum number of results * @param maxtime the maximum time in milliseconds @@ -144,6 +145,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { @Override public BlockingQueue concurrentDocumentsByQuery( final String querystring, + final String sort, final int offset, final int maxcount, final long maxtime, @@ -160,7 +162,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { int count = 0; while (System.currentTimeMillis() < endtime && count < maxcount) { try { - SolrDocumentList sdl = getDocumentListByQuery(querystring, o, Math.min(maxcount, pagesize), fields); + SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize), fields); for (SolrDocument d: sdl) { try {queue.put(d);} catch (final InterruptedException e) {break;} count++; @@ -185,6 +187,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { @Override public BlockingQueue concurrentIDsByQuery( final String querystring, + final String sort, final int offset, final int maxcount, final long maxtime, @@ -199,7 +202,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { int o = offset; while (System.currentTimeMillis() < endtime) { try { - SolrDocumentList sdl = getDocumentListByQuery(querystring, o, Math.min(maxcount, pagesize), CollectionSchema.id.getSolrFieldName()); + SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize), CollectionSchema.id.getSolrFieldName()); for (SolrDocument d: sdl) { try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;} } @@ -222,7 +225,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { @Override public Iterator iterator() { - final BlockingQueue queue = concurrentIDsByQuery(CATCHALL_QUERY, 0, Integer.MAX_VALUE, 60000, 2 * pagesize, 1); + final BlockingQueue queue = concurrentIDsByQuery(CATCHALL_QUERY, null, 0, Integer.MAX_VALUE, 60000, 2 * pagesize, 1); return new LookAheadIterator() { @Override protected String next0() { @@ -245,22 +248,43 @@ public abstract class AbstractSolrConnector implements SolrConnector { * @throws IOException */ @Override - public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException { + public SolrDocumentList getDocumentListByQuery( + final String querystring, + final String sort, + final int offset, + final int count, + final String ... fields) throws IOException { + // construct query + final SolrQuery params = getSolrQuery(querystring, sort, offset, count, fields); + + // query the server + final SolrDocumentList docs = getDocumentListByParams(params); + return docs; + } + + public static SolrQuery getSolrQuery( + final String querystring, + final String sort, + final int offset, + final int count, + final String ... 
fields) { // construct query final SolrQuery params = new SolrQuery(); params.setQuery(querystring); + params.clearSorts(); + if (sort != null) { + params.set("sort", sort); + } params.setRows(count); params.setStart(offset); params.setFacet(false); - params.clearSorts(); if (fields.length > 0) params.setFields(fields); params.setIncludeScore(false); - // query the server - final SolrDocumentList docs = getDocumentListByParams(params); - return docs; + return params; } - + + @Override public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException { final SolrDocumentList sdl = getDocumentListByParams(params); diff --git a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java index 9d09a9040..2147fca85 100644 --- a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java @@ -211,7 +211,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo * @throws IOException */ @Override - public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException { + public SolrDocumentList getDocumentListByQuery(final String querystring, final String sort, final int offset, final int count, final String ... fields) throws IOException { if (offset == 0 && count == 1 && querystring.startsWith("id:") && ((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') || querystring.length() == 15)) { @@ -222,14 +222,14 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo return list; } if (this.solr != null) { - SolrDocumentList list = this.solr.getDocumentListByQuery(querystring, offset, count, fields); + SolrDocumentList list = this.solr.getDocumentListByQuery(querystring, sort, offset, count, fields); addToCache(list, fields.length == 0); return list; } // combine both lists SolrDocumentList list; - list = this.solr.getDocumentListByQuery(querystring, offset, count, fields); + list = this.solr.getDocumentListByQuery(querystring, sort, offset, count, fields); // add caching addToCache(list, fields.length == 0); diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index 7a2844aea..755819b28 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -382,7 +382,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } @Override - public SolrDocumentList getDocumentListByQuery(String querystring, int offset, int count, String... fields) throws IOException, SolrException { + public SolrDocumentList getDocumentListByQuery(String querystring, String sort, int offset, int count, String... 
fields) throws IOException, SolrException { if (offset == 0 && count == 1 && querystring.startsWith("id:") && ((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') || querystring.length() == 15)) { @@ -392,7 +392,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { return list; } - SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); + SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, sort, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); /* Iterator i = sdl.iterator(); while (i.hasNext()) { @@ -415,13 +415,13 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } @Override - public BlockingQueue concurrentDocumentsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) { - return this.connector.concurrentDocumentsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency, fields); + public BlockingQueue concurrentDocumentsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) { + return this.connector.concurrentDocumentsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency, fields); } @Override - public BlockingQueue concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) { - return this.connector.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency); + public BlockingQueue concurrentIDsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) { + return this.connector.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency); } } diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 27f6b9d68..db3f95434 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -360,16 +360,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo private SolrQueryRequest request; private DocList response; - public DocListSearcher(final String querystring, final int offset, final int count, final String ... fields) { + public DocListSearcher(final String querystring, String sort, final int offset, final int count, final String ... 
fields) { // construct query - final SolrQuery params = new SolrQuery(); - params.setQuery(querystring); - params.setRows(count); - params.setStart(offset); - params.setFacet(false); - params.clearSorts(); - if (fields.length > 0) params.setFields(fields); - params.setIncludeScore(false); + final SolrQuery params = AbstractSolrConnector.getSolrQuery(querystring, sort, offset, count, fields); // query the server this.request = EmbeddedSolrConnector.this.request(params); @@ -395,7 +388,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo int numFound = 0; DocListSearcher docListSearcher = null; try { - docListSearcher = new DocListSearcher(querystring, 0, 0, CollectionSchema.id.getSolrFieldName()); + docListSearcher = new DocListSearcher(querystring, null, 0, 0, CollectionSchema.id.getSolrFieldName()); numFound = docListSearcher.response.matches(); } finally { if (docListSearcher != null) docListSearcher.close(); @@ -414,7 +407,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo int responseCount = 0; DocListSearcher docListSearcher = null; try { - docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); + docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, null, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); responseCount = docListSearcher.response.size(); if (responseCount == 0) return null; SolrIndexSearcher searcher = docListSearcher.request.getSearcher(); @@ -431,7 +424,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo } @Override - public BlockingQueue concurrentIDsByQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) { + public BlockingQueue concurrentIDsByQuery(final String querystring, final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) { final BlockingQueue queue = buffersize <= 0 ? new LinkedBlockingQueue() : new ArrayBlockingQueue(buffersize); final long endtime = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity! 
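For reference, the SolrQuery that the shared getSolrQuery helper introduced above assembles is equivalent to the following direct construction; the query, sort, and field values are invented:

    // equivalent of AbstractSolrConnector.getSolrQuery("host_s:example.org", "url_chars_i asc", 0, 100, "sku")
    final SolrQuery params = new SolrQuery();
    params.setQuery("host_s:example.org");
    params.clearSorts();
    params.set("sort", "url_chars_i asc"); // skipped entirely when sort == null
    params.setRows(100);     // count
    params.setStart(0);      // offset
    params.setFacet(false);
    params.setFields("sku"); // only set when at least one field is given
    params.setIncludeScore(false);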
final Thread t = new Thread() { @@ -443,7 +436,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo while (System.currentTimeMillis() < endtime) { try { responseCount = 0; - docListSearcher = new DocListSearcher(querystring, o, pagesize, CollectionSchema.id.getSolrFieldName()); + docListSearcher = new DocListSearcher(querystring, sort, o, pagesize, CollectionSchema.id.getSolrFieldName()); responseCount = docListSearcher.response.size(); SolrIndexSearcher searcher = docListSearcher.request.getSearcher(); DocIterator iterator = docListSearcher.response.iterator(); diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index 045f0bee5..1820e4248 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -218,7 +218,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo * @throws IOException */ @Override - public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException { + public SolrDocumentList getDocumentListByQuery(final String querystring, final String sort, final int offset, final int count, final String ... fields) throws IOException { if (this.solr0 == null && this.solr1 == null) return new SolrDocumentList(); if (offset == 0 && count == 1 && querystring.startsWith("id:") && ((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') || @@ -230,31 +230,31 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo return list; } if (this.solr0 != null && this.solr1 == null) { - SolrDocumentList list = this.solr0.getDocumentListByQuery(querystring, offset, count, fields); + SolrDocumentList list = this.solr0.getDocumentListByQuery(querystring, sort, offset, count, fields); return list; } if (this.solr1 != null && this.solr0 == null) { - SolrDocumentList list = this.solr1.getDocumentListByQuery(querystring, offset, count, fields); + SolrDocumentList list = this.solr1.getDocumentListByQuery(querystring, sort, offset, count, fields); return list; } // combine both lists SolrDocumentList l; - l = this.solr0.getDocumentListByQuery(querystring, offset, count, fields); + l = this.solr0.getDocumentListByQuery(querystring, sort, offset, count, fields); if (l.size() >= count) return l; // at this point we need to know how many results are in solr0 // compute this with a very bad hack; replace with better method later int size0 = 0; { //bad hack - TODO: replace - SolrDocumentList lHack = this.solr0.getDocumentListByQuery(querystring, 0, Integer.MAX_VALUE, fields); + SolrDocumentList lHack = this.solr0.getDocumentListByQuery(querystring, sort, 0, Integer.MAX_VALUE, fields); size0 = lHack.size(); } // now use the size of the first query to do a second query final SolrDocumentList list = new SolrDocumentList(); for (final SolrDocument d: l) list.add(d); - l = this.solr1.getDocumentListByQuery(querystring, offset + l.size() - size0, count - l.size(), fields); + l = this.solr1.getDocumentListByQuery(querystring, sort, offset + l.size() - size0, count - l.size(), fields); for (final SolrDocument d: l) list.add(d); return list; @@ -427,10 +427,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public BlockingQueue concurrentIDsByQuery(final String 
querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) { - if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency); - if (this.solr0 == null && this.solr1 != null) return this.solr1.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency); - return super.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency); + public BlockingQueue concurrentIDsByQuery(final String querystring, final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) { + if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency); + if (this.solr0 == null && this.solr1 != null) return this.solr1.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency); + return super.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency); } } diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index dcc9d1d06..ccd774e93 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -180,12 +180,18 @@ public interface SolrConnector extends Iterable /* Iterable of document * get a query result from solr * to get all results set the query String to "*:*" * @param querystring the solr query string + * @param sort the solr sort string, may be null to be not used * @param offset the first result offset * @param count number of wanted results * @param fields list of fields * @throws IOException */ - public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException, SolrException; + public SolrDocumentList getDocumentListByQuery( + final String querystring, + final String sort, + final int offset, + final int count, + final String ... fields) throws IOException, SolrException; /** * get the number of results when this query is done. @@ -210,6 +216,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned. * The method returns immediately and feeds the search results into the queue * @param querystring the solr query string + * @param sort the solr sort string, may be null to be not used * @param offset first result offset * @param maxcount the maximum number of results * @param maxtime the maximum time in milliseconds @@ -220,6 +227,7 @@ public interface SolrConnector extends Iterable /* Iterable of document */ public BlockingQueue concurrentDocumentsByQuery( final String querystring, + final String sort, final int offset, final int maxcount, final long maxtime, @@ -232,6 +240,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * The result queue is considered as terminated if AbstractSolrConnector.POISON_ID is returned. 
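The ID variant is consumed the same way. A sketch with invented limits; the catch-all query and a null sort retrieve all document ids in default index order:

    BlockingQueue<String> ids = connector.concurrentIDsByQuery("*:*", null, 0, Integer.MAX_VALUE, 60000, 200, 1);
    try {
        String id;
        while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) {
            // consume one document id per loop
        }
    } catch (final InterruptedException e) {}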
* The method returns immediately and feeds the search results into the queue * @param querystring + * @param sort the solr sort string, may be null to be not used * @param offset * @param maxcount * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used @@ -240,6 +249,7 @@ public interface SolrConnector extends Iterable /* Iterable of document */ public BlockingQueue concurrentIDsByQuery( final String querystring, + final String sort, final int offset, final int maxcount, final long maxtime, diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 3e7780cab..78638f014 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -171,7 +171,7 @@ public class ErrorCache { } if (failDoc != null) return failDoc; try { - final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 0, 1); + final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, null, 0, 1); if (docs == null || docs.isEmpty()) return null; SolrDocument doc = docs.get(0); if (doc == null) return null; diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index e64178e1e..c78180a25 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -428,7 +428,7 @@ public final class Fulltext { final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + ((freshdate != null && freshdate.before(new Date())) ? 
(" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); final AtomicInteger count = new AtomicInteger(0); - final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, 0, 1000000, 600000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, null, 0, 1000000, 600000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); try { Set deleteIDs = new HashSet(); SolrDocument doc; @@ -664,7 +664,7 @@ public final class Fulltext { this.count++; } } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, 1, + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, 10 * 60 * 60 * 1000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); SolrDocument doc; diff --git a/source/net/yacy/search/index/ReindexSolrBusyThread.java b/source/net/yacy/search/index/ReindexSolrBusyThread.java index bc1bd2f7b..88209f91e 100644 --- a/source/net/yacy/search/index/ReindexSolrBusyThread.java +++ b/source/net/yacy/search/index/ReindexSolrBusyThread.java @@ -113,7 +113,7 @@ import org.apache.solr.common.SolrInputDocument; if (sem.tryAcquire()) { try { String query = querylist.get(0); - SolrDocumentList xdocs = esc.getDocumentListByQuery(query, start, chunksize); + SolrDocumentList xdocs = esc.getDocumentListByQuery(query, null, start, chunksize); docstoreindex = (int) xdocs.getNumFound(); if (xdocs.size() == 0) { // no documents returned = all of current query reindexed (or eventual start to large) diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 5693a436b..dddb59c34 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -30,6 +30,7 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -80,6 +81,7 @@ import net.yacy.repository.LoaderDispatcher; import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; +import net.yacy.search.schema.HyperlinkGraph; import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphSchema; @@ -259,21 +261,13 @@ public class Segment { return 999; } + private static RowHandleSet getPossibleRootHashes(final DigestURL url) { RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? 
(":" + url.getPort()) : ""); try { rootCandidates.put(new DigestURL(rootStub).hash()); - rootCandidates.put(new DigestURL(rootStub + "/").hash()); - rootCandidates.put(new DigestURL(rootStub + "/index.htm").hash()); - rootCandidates.put(new DigestURL(rootStub + "/index.html").hash()); - rootCandidates.put(new DigestURL(rootStub + "/index.php").hash()); - rootCandidates.put(new DigestURL(rootStub + "/home.htm").hash()); - rootCandidates.put(new DigestURL(rootStub + "/home.html").hash()); - rootCandidates.put(new DigestURL(rootStub + "/home.php").hash()); - rootCandidates.put(new DigestURL(rootStub + "/default.htm").hash()); - rootCandidates.put(new DigestURL(rootStub + "/default.html").hash()); - rootCandidates.put(new DigestURL(rootStub + "/default.php").hash()); + for (String rootfn: HyperlinkGraph.ROOTFNS) rootCandidates.put(new DigestURL(rootStub + rootfn).hash()); rootCandidates.optimize(); } catch (final Throwable e) {} rootCandidates.optimize(); @@ -310,22 +304,41 @@ public class Segment { public class ClickdepthCache { private final ReferenceReportCache rrc; + private final Map hyperlinkGraphCache; // map from host name to a HyperlinkGraph for that host name private final Map cache; public final int maxdepth; // maximum clickdepth public final int maxtime; // maximum time to compute clickdepth public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) { this.rrc = rrc; + this.hyperlinkGraphCache = new HashMap(); this.cache = new ConcurrentHashMap(); this.maxdepth = maxdepth; this.maxtime = maxtime; } public int getClickdepth(final DigestURL url) throws IOException { + // first try: get the clickdepth from the cache Integer clickdepth = cache.get(ASCII.String(url.hash())); if (MemoryControl.shortStatus()) cache.clear(); if (clickdepth != null) { //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); return clickdepth.intValue(); } + + // second try: get the clickdepth from a hyperlinGraphCache (forward clickdepth) + HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost()); + if (hlg == null) { + hlg = new HyperlinkGraph(); + hlg.fill(fulltext.getDefaultConnector(), url.getHost(), 300000, 10000000); + hlg.findLinkDepth(); + hyperlinkGraphCache.put(url.getHost(), hlg); + } + clickdepth = hlg.getDepth(url); + if (clickdepth != null) { + return clickdepth.intValue(); + } + + + // third try: get the clickdepth from a reverse link graph clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth); this.cache.put(ASCII.String(url.hash()), clickdepth); @@ -375,7 +388,7 @@ public class Segment { if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) { // reqd the references from the webgraph SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); - BlockingQueue docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName()); + BlockingQueue docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName()); SolrDocument doc; try { while ((doc = docs.take()) != 
AbstractSolrConnector.POISON_DOCUMENT) { @@ -478,12 +491,12 @@ public class Segment { final BlockingQueue docQueue; final String urlstub; if (stub == null) { - docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(AbstractSolrConnector.CATCHALL_QUERY, 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(AbstractSolrConnector.CATCHALL_QUERY, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); urlstub = null; } else { final String host = stub.getHost(); String hh = DigestURL.hosthash(host); - docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); + docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); urlstub = stub.toNormalform(true); } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 461bcd69f..ca651b56b 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -962,7 +962,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; long patchquerycount = collectionConnector.getCountByQuery(patchquery); - BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 600000, 200, 1, + BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 600000, 200, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName()); SolrDocument doc_B; int patchquerycountcheck = 0; @@ -1044,7 +1044,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query); int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4)); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency); - final BlockingQueue docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 200, concurrency); + final BlockingQueue docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1800000, 200, concurrency); final AtomicInteger proccount = new AtomicInteger(0); Thread[] t 
= new Thread[concurrency]; for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) { @@ -1151,7 +1151,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri long count = collectionConnector.getCountByQuery(query); long start = System.currentTimeMillis(); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); - BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 200, 1); + BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery(query, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1800000, 200, 1); int countcheck = 0; Collection failids = new ArrayList(); SolrDocument doc; @@ -1274,7 +1274,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri this.crt = new ConcurrentHashMap(); try { // select all documents for each host - BlockingQueue ids = connector.concurrentIDsByQuery("{!raw f=" + CollectionSchema.host_s.getSolrFieldName() + "}" + host, 0, 10000000, 600000, 200, 1); + BlockingQueue ids = connector.concurrentIDsByQuery("{!raw f=" + CollectionSchema.host_s.getSolrFieldName() + "}" + host, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 600000, 200, 1); String id; while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) { this.crt.put(id, new double[]{0.0d,0.0d}); //{old value, new value} diff --git a/source/net/yacy/search/schema/HyperlinkEdge.java b/source/net/yacy/search/schema/HyperlinkEdge.java index 587699ae0..6526f0871 100644 --- a/source/net/yacy/search/schema/HyperlinkEdge.java +++ b/source/net/yacy/search/schema/HyperlinkEdge.java @@ -37,4 +37,16 @@ public class HyperlinkEdge { this.type = type; } + @Override + public String toString() { + StringBuilder sb = new StringBuilder(120); + sb.append(this.source.toNormalform(true)); + sb.append(" -> "); + sb.append(this.target.toNormalform(true)); + sb.append(" ("); + sb.append(type.name()); + sb.append(")"); + return sb.toString(); + } + } diff --git a/source/net/yacy/search/schema/HyperlinkGraph.java b/source/net/yacy/search/schema/HyperlinkGraph.java new file mode 100644 index 000000000..312f70674 --- /dev/null +++ b/source/net/yacy/search/schema/HyperlinkGraph.java @@ -0,0 +1,197 @@ +/** + * HyperlinkGraph + * Copyright 2014 by Michael Peter Christen + * First released 08.04.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +package net.yacy.search.schema; + +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.BlockingQueue; + +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.FailType; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.data.meta.URIMetadataNode; + +import org.apache.solr.common.SolrDocument; + + +public class HyperlinkGraph implements Iterable<HyperlinkEdge> { + + public final static Set<String> ROOTFNS = new HashSet<String>(); + static { + for (String s: new String[]{"/", "/index.htm", "/index.html", "/index.php", "/home.htm", "/home.html", "/home.php", "/default.htm", "/default.html", "/default.php"}) { + ROOTFNS.add(s); + } + } + + Map<String, HyperlinkEdge> edges; + Map<DigestURL, Integer> depths; + String hostname; + + public HyperlinkGraph() { + this.edges = new LinkedHashMap<String, HyperlinkEdge>(); + this.depths = new HashMap<DigestURL, Integer>(); + this.hostname = null; + } + + public void fill(final SolrConnector solrConnector, String hostname, final int maxtime, final int maxnodes) { + this.hostname = hostname; + if (hostname.startsWith("www.")) hostname = hostname.substring(4); + StringBuilder q = new StringBuilder(); + q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname); + BlockingQueue<SolrDocument> docs = solrConnector.concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, 100, 1, + CollectionSchema.id.getSolrFieldName(), + CollectionSchema.sku.getSolrFieldName(), + CollectionSchema.failreason_s.getSolrFieldName(), + CollectionSchema.failtype_s.getSolrFieldName(), + CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), + CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(), + CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), + CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName() + ); + SolrDocument doc; + Map<String, FailType> errorDocs = new HashMap<String, FailType>(); + Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>(); + Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>(); + Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>(); + try { + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); + String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); + DigestURL from = new DigestURL(u, ASCII.getBytes(ids)); + String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); + FailType error = errortype == null ?
null : FailType.valueOf(errortype); + if (error != null) { + errorDocs.put(u, error); + } else { + Iterator<String> links = URIMetadataNode.getLinks(doc, true); // inbound + String link; + while (links.hasNext()) { + link = links.next(); + try { + DigestURL linkurl = new DigestURL(link, null); + String edgehash = ids + ASCII.String(linkurl.hash()); + inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound)); + } catch (MalformedURLException e) {} + } + links = URIMetadataNode.getLinks(doc, false); // outbound + while (links.hasNext()) { + link = links.next(); + try { + DigestURL linkurl = new DigestURL(link, null); + String edgehash = ids + ASCII.String(linkurl.hash()); + outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound)); + } catch (MalformedURLException e) {} + } + } + if (inboundEdges.size() + outboundEdges.size() > maxnodes) { + break; + } + } + } catch (InterruptedException e) { + } catch (MalformedURLException e) { + } + // we use the errorDocs to mark all edges that point to an error document + Iterator<Map.Entry<String, HyperlinkEdge>> i = inboundEdges.entrySet().iterator(); + Map.Entry<String, HyperlinkEdge> edge; + while (i.hasNext()) { + edge = i.next(); + if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) { + i.remove(); + edge.getValue().type = HyperlinkEdge.Type.Dead; + errorEdges.put(edge.getKey(), edge.getValue()); + } + } + i = outboundEdges.entrySet().iterator(); + while (i.hasNext()) { + edge = i.next(); + if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) { + i.remove(); + edge.getValue().type = HyperlinkEdge.Type.Dead; + errorEdges.put(edge.getKey(), edge.getValue()); + } + } + // we put all edges together in a specific order which is used to create nodes in an svg display: + // nodes that appear first are possibly painted over by nodes coming later.
+ // less important nodes shall therefore appear first + this.edges.putAll(outboundEdges); + this.edges.putAll(inboundEdges); + this.edges.putAll(errorEdges); + } + + public int findLinkDepth() { + + int remaining = this.edges.size(); + + // first find root nodes + Set<DigestURL> nodes = new HashSet<DigestURL>(); + Set<DigestURL> nextnodes = new HashSet<DigestURL>(); + for (HyperlinkEdge edge: this.edges.values()) { + String path = edge.source.getPath(); + if (ROOTFNS.contains(path)) { + if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, 0); + if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, 1); + nodes.add(edge.source); + nextnodes.add(edge.target); + remaining--; + } + } + if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges"); + + // recursively step into depth and find next level + int depth = 1; + while (remaining > 0) { + boolean found = false; + nodes = nextnodes; + nextnodes = new HashSet<DigestURL>(); + for (HyperlinkEdge edge: this.edges.values()) { + if (nodes.contains(edge.source)) { + if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, depth); + if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, depth + 1); + nextnodes.add(edge.target); + remaining--; + found = true; + } + } + depth++; + if (!found) break; // terminate in case that not all edges are linked together + } + if (remaining > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find all edges for " + hostname + ", " + remaining + " remaining."); + return depth - 1; + } + + public Integer getDepth(DigestURL url) { + return this.depths.get(url); + } + + @Override + public Iterator<HyperlinkEdge> iterator() { + return this.edges.values().iterator(); + } + +}
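Taken together, the new class is used the way the linkstructure servlet and the ClickdepthCache above use it; a condensed sketch, with the connector source and the host name as placeholders:

    HyperlinkGraph hlg = new HyperlinkGraph();
    hlg.fill(segment.fulltext().getDefaultConnector(), "example.org", 10000, 10000); // maxtime 10s, maxnodes 10000
    int maxdepth = hlg.findLinkDepth();         // breadth-first from the root file names in ROOTFNS
    for (HyperlinkEdge e: hlg) {                // iteration order: outbound, inbound, then dead edges
        Integer depth = hlg.getDepth(e.source); // null if the source was never reached from a root
        System.out.println(e + " source depth=" + (depth == null ? -1 : depth.intValue()));
    }
    System.out.println("maximum clickdepth found: " + maxdepth);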