linkstructure refactoring to get more options for clickdepth analysis

pull/1/head
orbiter 11 years ago
parent 8068e68474
commit c250fac9f4
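
The refactoring gives the linkstructure servlet two request modes instead of one: a host-scoped graph crawl driven by "about", and a new point-to-point mode driven by "from"/"to" that delegates to the new HyperlinkGraph.path(). A minimal client sketch of both modes follows; the endpoint path and port are assumed YaCy defaults and are not part of this commit, only the parameter names about, from, to, maxtime and maxnodes come from the diff below.

// Sketch only: exercises both request modes of the refactored servlet.
// Endpoint path and port are assumptions (YaCy defaults), not taken from this commit.
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class LinkstructureDemo {
    public static void main(String[] args) throws Exception {
        // mode 1: link structure within one host ("about" may be a URL, URL hash or domain)
        dump("http://localhost:8090/api/linkstructure.json?about=example.com&maxtime=1000&maxnodes=100");
        // mode 2: link path between two documents ("from" is optional, "to" must be a URL)
        dump("http://localhost:8090/api/linkstructure.json?from=http://example.com/&to=http://example.com/a.html");
    }

    private static void dump(String location) throws Exception {
        try (InputStream in = new URL(location).openStream()) {
            System.out.println(new String(in.readAllBytes(), StandardCharsets.UTF_8));
        }
    }
}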

@@ -27,6 +27,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Fulltext;
+import net.yacy.search.index.Segment.ReferenceReportCache;
 import net.yacy.search.schema.HyperlinkEdge;
 import net.yacy.search.schema.HyperlinkGraph;
 import net.yacy.server.serverObjects;
@@ -45,33 +46,53 @@ public class linkstructure {
         final Switchboard sb = (Switchboard) env;
         Fulltext fulltext = sb.index.fulltext();
         if (post == null) return prop;
-        String about = post.get("about", null); // may be a URL, a URL hash or a domain hash
-        if (about == null) return prop;
         boolean authenticated = sb.adminAuthenticated(header) >= 2;
         int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 300000 : 1000);
         int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 10000000 : 100);
-        DigestURL url = null;
-        String hostname = null;
-        if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
-            byte[] urlhash = ASCII.getBytes(about);
-            url = authenticated ? sb.getURL(urlhash) : null;
-        } else if (url == null && about.length() > 0) {
-            // consider "about" as url or hostname
-            try {
+        HyperlinkGraph hlg = new HyperlinkGraph();
+        int maxdepth = 0;
+        if (post.get("about", null) != null) try {
+            // get link structure within a host
+            String about = post.get("about", null); // may be a URL, a URL hash or a domain hash
+            DigestURL url = null;
+            String hostname = null;
+            if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
+                byte[] urlhash = ASCII.getBytes(about);
+                url = authenticated ? sb.getURL(urlhash) : null;
+            } else if (url == null && about.length() > 0) { // consider "about" as url or hostname
                 url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
                 hostname = url.getHost();
-            } catch (final MalformedURLException e) {
-            }
-        }
-        if (hostname == null) return prop;
-        // now collect _all_ documents inside the domain until a timeout appears
-        HyperlinkGraph hlg = new HyperlinkGraph();
-        hlg.fill(fulltext.getDefaultConnector(), hostname, maxtime, maxnodes);
-        int maxdepth = hlg.findLinkDepth();
+            }
+            if (hostname == null) return prop;
+            // now collect _all_ documents inside the domain until a timeout appears
+            hlg.fill(fulltext.getDefaultConnector(), hostname, null, maxtime, maxnodes);
+            maxdepth = hlg.findLinkDepth();
+        } catch (final MalformedURLException e) {}
+        else if (post.get("to", null) != null) try {
+            // get link structure between two links
+            DigestURL to = new DigestURL(post.get("to", null), null); // must be an url
+            DigestURL from = post.get("from", null) == null ? null : new DigestURL(post.get("from", null)); // can be null or must be an url
+            ReferenceReportCache rrc = sb.index.getReferenceReportCache();
+            hlg.path(sb.index, rrc, from, to, maxtime, maxnodes);
+        } catch (final MalformedURLException e) {}
+        // finally just write out the edge array
+        writeGraph(prop, hlg, maxdepth);
+        // Adding CORS Access header for xml output
+        if (xml) {
+            final ResponseHeader outgoingHeader = new ResponseHeader(200);
+            outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
+            prop.setOutgoingHeader(outgoingHeader);
+        }
+        // return rewrite properties
+        return prop;
+    }
+
+    private static void writeGraph(final servletProperties prop, final HyperlinkGraph hlg, final int maxdepth) {
         int c = 0;
         for (HyperlinkEdge e: hlg) {
             prop.putJSON("edges_" + c + "_source", e.source.getPath());
@@ -87,16 +108,6 @@ public class linkstructure {
             prop.put("edges_" + (c-1) + "_eol", 0);
         prop.put("edges", c);
         prop.put("maxdepth", maxdepth);
-        // Adding CORS Access header for xml output
-        if (xml) {
-            final ResponseHeader outgoingHeader = new ResponseHeader(200);
-            outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
-            prop.setOutgoingHeader(outgoingHeader);
-        }
-        // return rewrite properties
-        return prop;
     }
 }
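
A note on the control flow introduced above: if (…) try { … } catch (…) {} else if (…) try { … } looks suspicious but is valid Java, because a try statement is itself a single statement, so the else still binds to the if, not to the catch. A standalone illustration of the pattern, unrelated to YaCy:

// The if-branch consists of a bare try statement; the else pairs with the if.
public class TryElseDemo {
    public static void main(String[] args) {
        String s = args.length > 0 ? args[0] : null;
        if (s != null) try {
            System.out.println(Integer.parseInt(s));
        } catch (NumberFormatException e) {}
        else System.out.println("no argument given");
    }
}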

@@ -328,7 +328,7 @@ public class Segment {
         HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost());
         if (hlg == null) {
             hlg = new HyperlinkGraph();
-            hlg.fill(fulltext.getDefaultConnector(), url.getHost(), 300000, 10000000);
+            hlg.fill(fulltext.getDefaultConnector(), url.getHost(), null, 300000, 10000000);
             hlg.findLinkDepth();
             hyperlinkGraphCache.put(url.getHost(), hlg);
         }
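
The Segment change is only call-site maintenance: the host-graph cache still wants the complete graph, so it passes null for the new stopURL parameter. Both call forms of the extended fill() signature, as a fragment that assumes the YaCy classes from this diff on the classpath and uses placeholder hostnames and limits:

// Fragment: both call forms of the extended fill() signature.
// full host graph, no stop condition -- what Segment does for its cache
HyperlinkGraph hlg = new HyperlinkGraph();
hlg.fill(fulltext.getDefaultConnector(), "example.com", null, 300000, 10000000);

// targeted retrieval: aborts as soon as stopURL shows up as a link target,
// so the graph only grows until the target document is reached
DigestURL stop = new DigestURL("http://example.com/target.html");
hlg.fill(fulltext.getDefaultConnector(), "example.com", stop, 1000, 100);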

@@ -36,6 +36,9 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.index.Fulltext;
+import net.yacy.search.index.Segment;
+import net.yacy.search.index.Segment.ReferenceReportCache;
 import org.apache.solr.common.SolrDocument;
@@ -59,7 +62,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
         this.hostname = null;
     }
 
-    public void fill(final SolrConnector solrConnector, String hostname, final int maxtime, final int maxnodes) {
+    public void fill(final SolrConnector solrConnector, String hostname, final DigestURL stopURL, final int maxtime, final int maxnodes) {
         this.hostname = hostname;
         if (hostname.startsWith("www.")) hostname = hostname.substring(4);
         StringBuilder q = new StringBuilder();
@@ -80,7 +83,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
         Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
         Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
         try {
-            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+            retrieval: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                 String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                 String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                 DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
@@ -97,6 +100,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                         DigestURL linkurl = new DigestURL(link, null);
                         String edgehash = ids + ASCII.String(linkurl.hash());
                         inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Inbound));
+                        if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
                     } catch (MalformedURLException e) {}
                 }
                 links = URIMetadataNode.getLinks(doc, false); // outbound
@@ -106,11 +110,12 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
                         DigestURL linkurl = new DigestURL(link, null);
                         String edgehash = ids + ASCII.String(linkurl.hash());
                         outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkEdge.Type.Outbound));
+                        if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
                     } catch (MalformedURLException e) {}
                 }
                 }
                 if (inboundEdges.size() + outboundEdges.size() > maxnodes) {
-                    break;
+                    break retrieval;
                 }
             }
         } catch (InterruptedException e) {
@@ -144,6 +149,16 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
         this.edges.putAll(errorEdges);
     }
 
+    public void path(final Segment segment, ReferenceReportCache rrc, DigestURL from, DigestURL to, final int maxtime, final int maxnodes) {
+        // two steps to find the graph: (1) create a HyperlinkGraph (top-down) and (2) backtrack backlinks up to an element of the graph (bottom-up)
+        if (this.edges.size() == 0) {
+            fill(segment.fulltext().getDefaultConnector(), from == null ? to.getHost() : from.getHost(), to, maxtime, maxnodes);
+        }
+        if (getDepth(to) >= 0 && (from == null || getDepth(from) >= 0)) return; // nothing to do.
+        // now find the link bottom-up
+    }
+
     public int findLinkDepth() {
         int remaining = this.edges.size();
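
Step (2) announced in the path() comment is still a TODO at this commit ("now find the link bottom-up"). A rough sketch of what that backtrack could look like, assuming a hypothetical helper getReferrers() that resolves backlinks (for example via the ReferenceReportCache); getDepth(), edges and HyperlinkEdge exist in this class, everything else below is an assumption:

// Hypothetical sketch of the missing bottom-up step: walk backlinks from 'to'
// until a node is reached that the top-down graph already contains.
// getReferrers() is an assumed helper, not part of this commit.
private void backtrack(final DigestURL to, final int maxnodes) {
    final java.util.ArrayDeque<DigestURL> stack = new java.util.ArrayDeque<DigestURL>();
    final java.util.HashSet<String> seen = new java.util.HashSet<String>();
    stack.push(to);
    while (!stack.isEmpty() && seen.size() < maxnodes) {
        final DigestURL node = stack.pop();
        for (final DigestURL referrer : getReferrers(node)) { // assumed backlink lookup
            final String hash = ASCII.String(referrer.hash());
            if (!seen.add(hash)) continue;
            // edge keys follow the same source-hash + target-hash scheme as fill()
            this.edges.put(hash + ASCII.String(node.hash()),
                    new HyperlinkEdge(referrer, node, HyperlinkEdge.Type.Inbound));
            if (getDepth(referrer) >= 0) return; // reached the top-down graph
            stack.push(referrer);
        }
    }
}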
