new structure and enhancements for link graph computation:

- added a sort option to solr queries so that document lists can be retrieved
in a specific order, here: link (URL) length
- added the HyperlinkGraph class (built on HyperlinkEdge) which manages the link structure
- integrated the HyperlinkGraph class into the clickdepth computation
- extended the linkstructure.json servlet to also show the clickdepth
and other statistical information
commit bd886054cb (parent df138084c0, branch pull/1/head)
Author: Michael Peter Christen
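
In practice the connector API change is mechanical: every query method gains a sort parameter directly after the query string, and passing null keeps the previous unsorted behaviour. A minimal caller sketch, assuming only the connector API and schema fields shown in the diff below (the SortedQueryExample class and its limits are hypothetical):

import java.util.concurrent.BlockingQueue;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.search.schema.CollectionSchema;

public class SortedQueryExample {

    // hypothetical helper: print all document URLs of one host, shortest URL first
    public static void dumpHost(final SolrConnector connector, final String hostname) throws InterruptedException {
        final String query = CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"";
        final String sort = CollectionSchema.url_chars_i.getSolrFieldName() + " asc"; // order by URL length
        final BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
                query, sort,               // pass null as sort to keep the old, unordered behaviour
                0, 10000, 10000, 100, 1,   // offset, maxcount, maxtime (ms), buffersize, concurrency
                CollectionSchema.sku.getSolrFieldName());
        SolrDocument doc;
        while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
            System.out.println((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
        }
    }
}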

@ -288,7 +288,7 @@ public class HostBrowser {
q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
}
}
BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, 100000, TIMEOUT, 100, 1,
BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000, TIMEOUT, 100, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),

@ -130,7 +130,7 @@ public class IndexDeletion_p {
}
try {
DigestURL u = new DigestURL(urlStub);
BlockingQueue<SolrDocument> dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", 0, 100000000, Long.MAX_VALUE, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
BlockingQueue<SolrDocument> dq = defaultConnector.concurrentDocumentsByQuery(CollectionSchema.host_s.getSolrFieldName() + ":\"" + u.getHost() + "\"", null, 0, 100000000, Long.MAX_VALUE, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
SolrDocument doc;
try {
while ((doc = dq.take()) != AbstractSolrConnector.POISON_DOCUMENT) {

@ -127,7 +127,7 @@ public class citation {
}
try {
sentence = sentence.replace('"', '\'');
SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName());
SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100, CollectionSchema.sku.getSolrFieldName());
int count = (int) doclist.getNumFound();
if (count > 0) {
Set<DigestURL> list = new TreeSet<DigestURL>();

@ -17,29 +17,18 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.HyperlinkEdge;
import net.yacy.search.schema.HyperlinkGraph;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
@ -59,8 +48,8 @@ public class linkstructure {
String about = post.get("about", null); // may be a URL, a URL hash or a domain hash
if (about == null) return prop;
boolean authenticated = sb.adminAuthenticated(header) >= 2;
int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 60000 : 1000);
int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 1000 : 100);
int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 300000 : 1000);
int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 10000000 : 100);
DigestURL url = null;
String hostname = null;
@ -72,104 +61,32 @@ public class linkstructure {
try {
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
hostname = url.getHost();
if (hostname.startsWith("www.")) hostname = hostname.substring(4);
} catch (final MalformedURLException e) {
}
}
if (hostname == null) return prop;
// now collect _all_ documents inside the domain until a timeout appears
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, maxnodes, maxtime, 100, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
CollectionSchema.failtype_s.getSolrFieldName(),
CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
);
SolrDocument doc;
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
if (error != null) {
errorDocs.put(u, error);
} else {
Iterator<String> links = URIMetadataNode.getLinks(doc, true); // inbound
String link;
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound));
} catch (MalformedURLException e) {}
}
links = URIMetadataNode.getLinks(doc, false); // outbound
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound));
} catch (MalformedURLException e) {}
}
}
if (inboundEdges.size() + outboundEdges.size() > maxnodes) break;
}
} catch (InterruptedException e) {
} catch (MalformedURLException e) {
}
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator<Map.Entry<String, HyperlinkEdge>> i = inboundEdges.entrySet().iterator();
Map.Entry<String, HyperlinkEdge> edge;
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkEdge.Type.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
}
}
i = outboundEdges.entrySet().iterator();
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkEdge.Type.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
}
}
// we put all edges together in a specific order which is used to create nodes in an svg display:
// nodes that appear first are possibly painted over by nodes coming later;
// less important nodes shall therefore appear first
Map<String, HyperlinkEdge> edges = new LinkedHashMap<String, HyperlinkEdge>();
edges.putAll(outboundEdges);
edges.putAll(inboundEdges);
edges.putAll(errorEdges);
HyperlinkGraph hlg = new HyperlinkGraph();
hlg.fill(fulltext.getDefaultConnector(), hostname, maxtime, maxnodes);
int maxdepth = hlg.findLinkDepth();
// finally just write out the edge array
int c = 0;
for (Map.Entry<String, HyperlinkEdge> e: edges.entrySet()) {
prop.putJSON("list_" + c + "_source", e.getValue().source.getPath());
prop.putJSON("list_" + c + "_target", e.getValue().type.equals(HyperlinkEdge.Type.Outbound) ? e.getValue().target.toNormalform(true) : e.getValue().target.getPath());
prop.putJSON("list_" + c + "_type", e.getValue().type.name());
prop.put("list_" + c + "_eol", 1);
for (HyperlinkEdge e: hlg) {
prop.putJSON("edges_" + c + "_source", e.source.getPath());
prop.putJSON("edges_" + c + "_target", e.type.equals(HyperlinkEdge.Type.Outbound) ? e.target.toNormalform(true) : e.target.getPath());
prop.putJSON("edges_" + c + "_type", e.type.name());
Integer depth_source = hlg.getDepth(e.source);
Integer depth_target = hlg.getDepth(e.target);
prop.put("edges_" + c + "_depthSource", depth_source == null ? -1 : depth_source.intValue());
prop.put("edges_" + c + "_depthTarget", depth_target == null ? -1 : depth_target.intValue());
prop.put("edges_" + c + "_eol", 1);
c++;
}
prop.put("list_" + (c-1) + "_eol", 0);
prop.put("list", c);
prop.put("edges_" + (c-1) + "_eol", 0);
prop.put("edges", c);
prop.put("maxdepth", maxdepth);
// Adding CORS Access header for xml output
if (xml) {

@ -1,5 +1,7 @@
[
#{list}#
{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#"}#(eol)#::,#(/eol)#
#{/list}#
]
{
"edges" : "#[edges]#",
"maxdepth" : "#[maxdepth]#",
"graph" : [#{edges}#
{"source":"#[source]#", "target":"#[target]#", "type":"#[type]#", "depthSource":"#[depthSource]#", "depthTarget":"#[depthTarget]#"}#(eol)#::,#(/eol)#
#{/edges}#]
}

@ -1,7 +1,9 @@
function linkstructure(hostname, element, width, height, maxtime, maxnodes) {
var nodes = {};
var links = [];
$.getJSON("/api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(links) {
var linkstructure = {};
$.getJSON("/api/linkstructure.json?about=" + hostname + "&maxtime=" + maxtime + "&maxnodes=" + maxnodes, function(linkstructure) {
links = linkstructure.graph;
links.forEach(function(link) {
link.source = nodes[link.source] || (nodes[link.source] = {name: link.source, type:"Inbound"});
link.target = nodes[link.target] || (nodes[link.target] = {name: link.target, type:link.type});

@ -201,7 +201,7 @@ public class OpenSearchConnector {
final long numfound;
try {
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, 0, 1, webgraphqueryfields);
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
numfound = docList.getNumFound();
if (numfound == 0) {
ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job");
@ -226,7 +226,7 @@ public class OpenSearchConnector {
Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
while (doloop) {
ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents
loopnr++;
if (stoptime < System.currentTimeMillis()) {// stop after max 1h
doloop = false;

@ -158,7 +158,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck;
}
try {
final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", 0, 1);
final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", null, 0, 1);
if (docs != null && !docs.isEmpty()) {
SolrDocument doc = docs.get(0);
// switch unique attribute in new document

@ -134,6 +134,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
* The method returns immediately and feeds the search results into the queue
* @param querystring the solr query string
* @param sort the solr sort string; may be null if no sorting is required
* @param offset first result offset
* @param maxcount the maximum number of results
* @param maxtime the maximum time in milliseconds
@ -144,6 +145,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
@ -160,7 +162,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
int count = 0;
while (System.currentTimeMillis() < endtime && count < maxcount) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, o, Math.min(maxcount, pagesize), fields);
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize), fields);
for (SolrDocument d: sdl) {
try {queue.put(d);} catch (final InterruptedException e) {break;}
count++;
@ -185,6 +187,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
@Override
public BlockingQueue<String> concurrentIDsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
@ -199,7 +202,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
int o = offset;
while (System.currentTimeMillis() < endtime) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, o, Math.min(maxcount, pagesize), CollectionSchema.id.getSolrFieldName());
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize), CollectionSchema.id.getSolrFieldName());
for (SolrDocument d: sdl) {
try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
}
@ -222,7 +225,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
@Override
public Iterator<String> iterator() {
final BlockingQueue<String> queue = concurrentIDsByQuery(CATCHALL_QUERY, 0, Integer.MAX_VALUE, 60000, 2 * pagesize, 1);
final BlockingQueue<String> queue = concurrentIDsByQuery(CATCHALL_QUERY, null, 0, Integer.MAX_VALUE, 60000, 2 * pagesize, 1);
return new LookAheadIterator<String>() {
@Override
protected String next0() {
@ -245,22 +248,43 @@ public abstract class AbstractSolrConnector implements SolrConnector {
* @throws IOException
*/
@Override
public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException {
public SolrDocumentList getDocumentListByQuery(
final String querystring,
final String sort,
final int offset,
final int count,
final String ... fields) throws IOException {
// construct query
final SolrQuery params = getSolrQuery(querystring, sort, offset, count, fields);
// query the server
final SolrDocumentList docs = getDocumentListByParams(params);
return docs;
}
public static SolrQuery getSolrQuery(
final String querystring,
final String sort,
final int offset,
final int count,
final String ... fields) {
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(querystring);
params.clearSorts();
if (sort != null) {
params.set("sort", sort);
}
params.setRows(count);
params.setStart(offset);
params.setFacet(false);
params.clearSorts();
if (fields.length > 0) params.setFields(fields);
params.setIncludeScore(false);
// query the server
final SolrDocumentList docs = getDocumentListByParams(params);
return docs;
return params;
}
@Override
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException {
final SolrDocumentList sdl = getDocumentListByParams(params);

@ -211,7 +211,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
* @throws IOException
*/
@Override
public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException {
public SolrDocumentList getDocumentListByQuery(final String querystring, final String sort, final int offset, final int count, final String ... fields) throws IOException {
if (offset == 0 && count == 1 && querystring.startsWith("id:") &&
((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') ||
querystring.length() == 15)) {
@ -222,14 +222,14 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
return list;
}
if (this.solr != null) {
SolrDocumentList list = this.solr.getDocumentListByQuery(querystring, offset, count, fields);
SolrDocumentList list = this.solr.getDocumentListByQuery(querystring, sort, offset, count, fields);
addToCache(list, fields.length == 0);
return list;
}
// combine both lists
SolrDocumentList list;
list = this.solr.getDocumentListByQuery(querystring, offset, count, fields);
list = this.solr.getDocumentListByQuery(querystring, sort, offset, count, fields);
// add caching
addToCache(list, fields.length == 0);

@ -382,7 +382,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
@Override
public SolrDocumentList getDocumentListByQuery(String querystring, int offset, int count, String... fields) throws IOException, SolrException {
public SolrDocumentList getDocumentListByQuery(String querystring, String sort, int offset, int count, String... fields) throws IOException, SolrException {
if (offset == 0 && count == 1 && querystring.startsWith("id:") &&
((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') ||
querystring.length() == 15)) {
@ -392,7 +392,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return list;
}
SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, sort, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
/*
Iterator<SolrDocument> i = sdl.iterator();
while (i.hasNext()) {
@ -415,13 +415,13 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) {
return this.connector.concurrentDocumentsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency, fields);
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) {
return this.connector.concurrentDocumentsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency, fields);
}
@Override
public BlockingQueue<String> concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) {
return this.connector.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency);
public BlockingQueue<String> concurrentIDsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) {
return this.connector.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency);
}
}

@ -360,16 +360,9 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
private SolrQueryRequest request;
private DocList response;
public DocListSearcher(final String querystring, final int offset, final int count, final String ... fields) {
public DocListSearcher(final String querystring, String sort, final int offset, final int count, final String ... fields) {
// construct query
final SolrQuery params = new SolrQuery();
params.setQuery(querystring);
params.setRows(count);
params.setStart(offset);
params.setFacet(false);
params.clearSorts();
if (fields.length > 0) params.setFields(fields);
params.setIncludeScore(false);
final SolrQuery params = AbstractSolrConnector.getSolrQuery(querystring, sort, offset, count, fields);
// query the server
this.request = EmbeddedSolrConnector.this.request(params);
@ -395,7 +388,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
int numFound = 0;
DocListSearcher docListSearcher = null;
try {
docListSearcher = new DocListSearcher(querystring, 0, 0, CollectionSchema.id.getSolrFieldName());
docListSearcher = new DocListSearcher(querystring, null, 0, 0, CollectionSchema.id.getSolrFieldName());
numFound = docListSearcher.response.matches();
} finally {
if (docListSearcher != null) docListSearcher.close();
@ -414,7 +407,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
int responseCount = 0;
DocListSearcher docListSearcher = null;
try {
docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, null, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
responseCount = docListSearcher.response.size();
if (responseCount == 0) return null;
SolrIndexSearcher searcher = docListSearcher.request.getSearcher();
@ -431,7 +424,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
}
@Override
public BlockingQueue<String> concurrentIDsByQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) {
public BlockingQueue<String> concurrentIDsByQuery(final String querystring, final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) {
final BlockingQueue<String> queue = buffersize <= 0 ? new LinkedBlockingQueue<String>() : new ArrayBlockingQueue<String>(buffersize);
final long endtime = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; // we know infinity!
final Thread t = new Thread() {
@ -443,7 +436,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
while (System.currentTimeMillis() < endtime) {
try {
responseCount = 0;
docListSearcher = new DocListSearcher(querystring, o, pagesize, CollectionSchema.id.getSolrFieldName());
docListSearcher = new DocListSearcher(querystring, sort, o, pagesize, CollectionSchema.id.getSolrFieldName());
responseCount = docListSearcher.response.size();
SolrIndexSearcher searcher = docListSearcher.request.getSearcher();
DocIterator iterator = docListSearcher.response.iterator();

@ -218,7 +218,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
* @throws IOException
*/
@Override
public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException {
public SolrDocumentList getDocumentListByQuery(final String querystring, final String sort, final int offset, final int count, final String ... fields) throws IOException {
if (this.solr0 == null && this.solr1 == null) return new SolrDocumentList();
if (offset == 0 && count == 1 && querystring.startsWith("id:") &&
((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') ||
@ -230,31 +230,31 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
return list;
}
if (this.solr0 != null && this.solr1 == null) {
SolrDocumentList list = this.solr0.getDocumentListByQuery(querystring, offset, count, fields);
SolrDocumentList list = this.solr0.getDocumentListByQuery(querystring, sort, offset, count, fields);
return list;
}
if (this.solr1 != null && this.solr0 == null) {
SolrDocumentList list = this.solr1.getDocumentListByQuery(querystring, offset, count, fields);
SolrDocumentList list = this.solr1.getDocumentListByQuery(querystring, sort, offset, count, fields);
return list;
}
// combine both lists
SolrDocumentList l;
l = this.solr0.getDocumentListByQuery(querystring, offset, count, fields);
l = this.solr0.getDocumentListByQuery(querystring, sort, offset, count, fields);
if (l.size() >= count) return l;
// at this point we need to know how many results are in solr0
// compute this with a very bad hack; replace with better method later
int size0 = 0;
{ //bad hack - TODO: replace
SolrDocumentList lHack = this.solr0.getDocumentListByQuery(querystring, 0, Integer.MAX_VALUE, fields);
SolrDocumentList lHack = this.solr0.getDocumentListByQuery(querystring, sort, 0, Integer.MAX_VALUE, fields);
size0 = lHack.size();
}
// now use the size of the first query to do a second query
final SolrDocumentList list = new SolrDocumentList();
for (final SolrDocument d: l) list.add(d);
l = this.solr1.getDocumentListByQuery(querystring, offset + l.size() - size0, count - l.size(), fields);
l = this.solr1.getDocumentListByQuery(querystring, sort, offset + l.size() - size0, count - l.size(), fields);
for (final SolrDocument d: l) list.add(d);
return list;
@ -427,10 +427,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public BlockingQueue<String> concurrentIDsByQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) {
if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency);
if (this.solr0 == null && this.solr1 != null) return this.solr1.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency);
return super.concurrentIDsByQuery(querystring, offset, maxcount, maxtime, buffersize, concurrency);
public BlockingQueue<String> concurrentIDsByQuery(final String querystring, final String sort, final int offset, final int maxcount, final long maxtime, final int buffersize, final int concurrency) {
if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency);
if (this.solr0 == null && this.solr1 != null) return this.solr1.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency);
return super.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency);
}
}

@ -180,12 +180,18 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* get a query result from solr
* to get all results set the query String to "*:*"
* @param querystring the solr query string
* @param sort the solr sort string; may be null if no sorting is required
* @param offset the first result offset
* @param count number of wanted results
* @param fields list of fields
* @throws IOException
*/
public SolrDocumentList getDocumentListByQuery(final String querystring, final int offset, final int count, final String ... fields) throws IOException, SolrException;
public SolrDocumentList getDocumentListByQuery(
final String querystring,
final String sort,
final int offset,
final int count,
final String ... fields) throws IOException, SolrException;
/**
* get the number of results when this query is done.
@ -210,6 +216,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
* The method returns immediately and feeds the search results into the queue
* @param querystring the solr query string
* @param sort the solr sort string; may be null if no sorting is required
* @param offset first result offset
* @param maxcount the maximum number of results
* @param maxtime the maximum time in milliseconds
@ -220,6 +227,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
*/
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,
@ -232,6 +240,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* The result queue is considered as terminated if AbstractSolrConnector.POISON_ID is returned.
* The method returns immediately and feeds the search results into the queue
* @param querystring
* @param sort the solr sort string; may be null if no sorting is required
* @param offset
* @param maxcount
* @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
@ -240,6 +249,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
*/
public BlockingQueue<String> concurrentIDsByQuery(
final String querystring,
final String sort,
final int offset,
final int maxcount,
final long maxtime,

@ -171,7 +171,7 @@ public class ErrorCache {
}
if (failDoc != null) return failDoc;
try {
final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, 0, 1);
final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, null, 0, 1);
if (docs == null || docs.isEmpty()) return null;
SolrDocument doc = docs.get(0);
if (doc == null) return null;

@ -428,7 +428,7 @@ public final class Fulltext {
final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, 0, 1000000, 600000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, null, 0, 1000000, 600000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
try {
Set<String> deleteIDs = new HashSet<String>();
SolrDocument doc;
@ -664,7 +664,7 @@ public final class Fulltext {
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, 1,
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, 10 * 60 * 60 * 1000, 100, 1,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc;

@ -113,7 +113,7 @@ import org.apache.solr.common.SolrInputDocument;
if (sem.tryAcquire()) {
try {
String query = querylist.get(0);
SolrDocumentList xdocs = esc.getDocumentListByQuery(query, start, chunksize);
SolrDocumentList xdocs = esc.getDocumentListByQuery(query, null, start, chunksize);
docstoreindex = (int) xdocs.getNumFound();
if (xdocs.size() == 0) { // no documents returned = all of the current query reindexed (or eventually the start offset is too large)

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@ -80,6 +81,7 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.HyperlinkGraph;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
@ -259,21 +261,13 @@ public class Segment {
return 999;
}
private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
try {
rootCandidates.put(new DigestURL(rootStub).hash());
rootCandidates.put(new DigestURL(rootStub + "/").hash());
rootCandidates.put(new DigestURL(rootStub + "/index.htm").hash());
rootCandidates.put(new DigestURL(rootStub + "/index.html").hash());
rootCandidates.put(new DigestURL(rootStub + "/index.php").hash());
rootCandidates.put(new DigestURL(rootStub + "/home.htm").hash());
rootCandidates.put(new DigestURL(rootStub + "/home.html").hash());
rootCandidates.put(new DigestURL(rootStub + "/home.php").hash());
rootCandidates.put(new DigestURL(rootStub + "/default.htm").hash());
rootCandidates.put(new DigestURL(rootStub + "/default.html").hash());
rootCandidates.put(new DigestURL(rootStub + "/default.php").hash());
for (String rootfn: HyperlinkGraph.ROOTFNS) rootCandidates.put(new DigestURL(rootStub + rootfn).hash());
rootCandidates.optimize();
} catch (final Throwable e) {}
rootCandidates.optimize();
@ -310,22 +304,41 @@ public class Segment {
public class ClickdepthCache {
private final ReferenceReportCache rrc;
private final Map<String, HyperlinkGraph> hyperlinkGraphCache; // map from host name to a HyperlinkGraph for that host name
private final Map<String, Integer> cache;
public final int maxdepth; // maximum clickdepth
public final int maxtime; // maximum time to compute clickdepth
public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
this.rrc = rrc;
this.hyperlinkGraphCache = new HashMap<String, HyperlinkGraph>();
this.cache = new ConcurrentHashMap<String, Integer>();
this.maxdepth = maxdepth;
this.maxtime = maxtime;
}
public int getClickdepth(final DigestURL url) throws IOException {
// first try: get the clickdepth from the cache
Integer clickdepth = cache.get(ASCII.String(url.hash()));
if (MemoryControl.shortStatus()) cache.clear();
if (clickdepth != null) {
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
return clickdepth.intValue();
}
// second try: get the clickdepth from a hyperlinGraphCache (forward clickdepth)
HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost());
if (hlg == null) {
hlg = new HyperlinkGraph();
hlg.fill(fulltext.getDefaultConnector(), url.getHost(), 300000, 10000000);
hlg.findLinkDepth();
hyperlinkGraphCache.put(url.getHost(), hlg);
}
clickdepth = hlg.getDepth(url);
if (clickdepth != null) {
return clickdepth.intValue();
}
// third try: get the clickdepth from a reverse link graph
clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
this.cache.put(ASCII.String(url.hash()), clickdepth);
@ -375,7 +388,7 @@ public class Segment {
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// read the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), 0, 10000000, 1000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName());
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName());
SolrDocument doc;
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
@ -478,12 +491,12 @@ public class Segment {
final BlockingQueue<SolrDocument> docQueue;
final String urlstub;
if (stub == null) {
docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(AbstractSolrConnector.CATCHALL_QUERY, 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(AbstractSolrConnector.CATCHALL_QUERY, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
urlstub = null;
} else {
final String host = stub.getHost();
String hh = DigestURL.hosthash(host);
docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, Integer.MAX_VALUE, maxtime, maxcount, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
urlstub = stub.toNormalform(true);
}

@ -962,7 +962,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
long patchquerycount = collectionConnector.getCountByQuery(patchquery);
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 600000, 200, 1,
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 600000, 200, 1,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
int patchquerycountcheck = 0;
@ -1044,7 +1044,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query);
int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 200, concurrency);
final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1800000, 200, concurrency);
final AtomicInteger proccount = new AtomicInteger(0);
Thread[] t = new Thread[concurrency];
for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) {
@ -1151,7 +1151,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
long count = collectionConnector.getCountByQuery(query);
long start = System.currentTimeMillis();
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 200, 1);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1800000, 200, 1);
int countcheck = 0;
Collection<String> failids = new ArrayList<String>();
SolrDocument doc;
@ -1274,7 +1274,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
this.crt = new ConcurrentHashMap<String, double[]>();
try {
// select all documents for each host
BlockingQueue<String> ids = connector.concurrentIDsByQuery("{!raw f=" + CollectionSchema.host_s.getSolrFieldName() + "}" + host, 0, 10000000, 600000, 200, 1);
BlockingQueue<String> ids = connector.concurrentIDsByQuery("{!raw f=" + CollectionSchema.host_s.getSolrFieldName() + "}" + host, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 10000000, 600000, 200, 1);
String id;
while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) {
this.crt.put(id, new double[]{0.0d,0.0d}); //{old value, new value}

@ -37,4 +37,16 @@ public class HyperlinkEdge {
this.type = type;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(120);
sb.append(this.source.toNormalform(true));
sb.append(" -> ");
sb.append(this.target.toNormalform(true));
sb.append(" (");
sb.append(type.name());
sb.append(")");
return sb.toString();
}
}

@ -0,0 +1,197 @@
/**
* HyperlinkGraph
* Copyright 2014 by Michael Peter Christen
* First released 08.04.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.schema;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import org.apache.solr.common.SolrDocument;
public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
public final static Set<String> ROOTFNS = new HashSet<String>();
static {
for (String s: new String[]{"/", "/index.htm", "/index.html", "/index.php", "/home.htm", "/home.html", "/home.php", "/default.htm", "/default.html", "/default.php"}) {
ROOTFNS.add(s);
}
}
Map<String, HyperlinkEdge> edges;
Map<DigestURL, Integer> depths;
String hostname;
public HyperlinkGraph() {
this.edges = new LinkedHashMap<String, HyperlinkEdge>();
this.depths = new HashMap<DigestURL, Integer>();
this.hostname = null;
}
public void fill(final SolrConnector solrConnector, String hostname, final int maxtime, final int maxnodes) {
this.hostname = hostname;
if (hostname.startsWith("www.")) hostname = hostname.substring(4);
StringBuilder q = new StringBuilder();
q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
BlockingQueue<SolrDocument> docs = solrConnector.concurrentDocumentsByQuery(q.toString(), CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, maxnodes, maxtime, 100, 1,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
CollectionSchema.failtype_s.getSolrFieldName(),
CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()
);
SolrDocument doc;
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
if (error != null) {
errorDocs.put(u, error);
} else {
Iterator<String> links = URIMetadataNode.getLinks(doc, true); // inbound
String link;
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound));
} catch (MalformedURLException e) {}
}
links = URIMetadataNode.getLinks(doc, false); // outbound
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound));
} catch (MalformedURLException e) {}
}
}
if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
break;
}
}
} catch (InterruptedException e) {
} catch (MalformedURLException e) {
}
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator<Map.Entry<String, HyperlinkEdge>> i = inboundEdges.entrySet().iterator();
Map.Entry<String, HyperlinkEdge> edge;
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkEdge.Type.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
}
}
i = outboundEdges.entrySet().iterator();
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkEdge.Type.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
}
}
// we put all edges together in a specific order which is used to create nodes in an svg display:
// nodes that appear first are possibly painted over by nodes coming later;
// less important nodes shall therefore appear first
this.edges.putAll(outboundEdges);
this.edges.putAll(inboundEdges);
this.edges.putAll(errorEdges);
}
public int findLinkDepth() {
int remaining = this.edges.size();
// first find root nodes
Set<DigestURL> nodes = new HashSet<DigestURL>();
Set<DigestURL> nextnodes = new HashSet<DigestURL>();
for (HyperlinkEdge edge: this.edges.values()) {
String path = edge.source.getPath();
if (ROOTFNS.contains(path)) {
if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, 0);
if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, 1);
nodes.add(edge.source);
nextnodes.add(edge.target);
remaining--;
}
}
if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
// recursively step into depth and find the next level
int depth = 1;
while (remaining > 0) {
boolean found = false;
nodes = nextnodes;
nextnodes = new HashSet<DigestURL>();
for (HyperlinkEdge edge: this.edges.values()) {
if (nodes.contains(edge.source)) {
if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, depth);
if (edge.type == HyperlinkEdge.Type.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, depth + 1);
nextnodes.add(edge.target);
remaining--;
found = true;
}
}
depth++;
if (!found) break; // terminate in case not all edges are linked together
}
if (remaining > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find all edges for " + hostname + ", " + remaining + " remaining.");
return depth - 1;
}
public Integer getDepth(DigestURL url) {
return this.depths.get(url);
}
@Override
public Iterator<HyperlinkEdge> iterator() {
return this.edges.values().iterator();
}
}
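
For orientation, this is roughly how the new class is driven by its two call sites in this commit, the linkstructure.json servlet and Segment.ClickdepthCache. A usage sketch under the same assumptions (the HyperlinkGraphExample wrapper and the time/node limits are illustrative):

import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.search.schema.HyperlinkEdge;
import net.yacy.search.schema.HyperlinkGraph;

public class HyperlinkGraphExample {

    public static void printGraph(final SolrConnector connector, final String hostname) {
        final HyperlinkGraph hlg = new HyperlinkGraph();
        hlg.fill(connector, hostname, 10000, 10000);  // collect the host's edges (maxtime in ms, maxnodes)
        final int maxdepth = hlg.findLinkDepth();     // forward clickdepth, starting at the root pages
        System.out.println("maximum clickdepth for " + hostname + ": " + maxdepth);
        for (final HyperlinkEdge e : hlg) {           // iteration order: outbound, inbound, error edges
            final Integer depth = hlg.getDepth(e.source); // null if the page was never reached from a root
            System.out.println((depth == null ? -1 : depth.intValue()) + " " + e.toString());
        }
    }
}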