Introduction of the HyperlinkEdges data structure, which should use

less memory because it does not store the source link again for each edge
of the graph.
pull/1/head
Michael Peter Christen 11 years ago
parent 6ea8bb7348
commit dd12dd392f

@ -48,8 +48,8 @@ public class linkstructure {
Fulltext fulltext = sb.index.fulltext();
if (post == null) return prop;
boolean authenticated = sb.adminAuthenticated(header) >= 2;
int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 300000 : 1000);
int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 10000000 : 100);
int maxtime = Math.min(post.getInt("maxtime", 60000), authenticated ? 300000 : 1000);
int maxnodes = Math.min(post.getInt("maxnodes", 10000), authenticated ? 10000000 : 100);
HyperlinkGraph hlg = new HyperlinkGraph();
int maxdepth = 0;
@ -97,8 +97,8 @@ public class linkstructure {
int c = 0;
for (HyperlinkEdge e: hlg) {
prop.putJSON("edges_" + c + "_source", e.source.getPath());
prop.putJSON("edges_" + c + "_target", e.type.equals(HyperlinkType.Outbound) ? e.target.toNormalform(true) : e.target.getPath());
prop.putJSON("edges_" + c + "_type", e.type.name());
prop.putJSON("edges_" + c + "_target", e.target.type.equals(HyperlinkType.Outbound) ? e.target.toNormalform(true) : e.target.getPath());
prop.putJSON("edges_" + c + "_type", e.target.type.name());
Integer depth_source = hlg.getDepth(e.source);
Integer depth_target = hlg.getDepth(e.target);
prop.put("edges_" + c + "_depthSource", depth_source == null ? -1 : depth_source.intValue());

@ -20,17 +20,33 @@
package net.yacy.search.schema;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL;
public class HyperlinkEdge {
public MultiProtocolURL source, target;
public static class Target extends MultiProtocolURL {
private static final long serialVersionUID = 5746600160371492930L;
public HyperlinkType type;
public HyperlinkEdge(MultiProtocolURL source, MultiProtocolURL target, HyperlinkType type) {
public Target(final String url, final HyperlinkType type) throws MalformedURLException {
super(url);
this.type = type;
}
public Target(final MultiProtocolURL url, final HyperlinkType type) {
super(url);
this.type = type;
}
}
public MultiProtocolURL source;
public Target target;
public HyperlinkEdge(MultiProtocolURL source, Target target) {
this.source = source;
this.target = target;
this.type = type;
}
@Override
@ -40,7 +56,7 @@ public class HyperlinkEdge {
sb.append(" -> ");
sb.append(this.target.toNormalform(true));
sb.append(" (");
sb.append(type.name());
sb.append(this.target.type.name());
sb.append(")");
return sb.toString();
}

@ -0,0 +1,114 @@
package net.yacy.search.schema;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.id.MultiProtocolURL;
/**
 * A set of hyperlink edges, stored grouped by their source URL.
 * <p>
 * Each source URL is kept once as a map key; the value holds all targets
 * linked from that source together with a depth value for the source.
 * Depth values for URLs that do not (yet) appear as a source are kept in a
 * separate map. A depth of {@code -1} means "no depth assigned".
 * <p>
 * Iteration yields one freshly created {@link HyperlinkEdge} per stored
 * (source, target) pair, in insertion order of sources and of the targets
 * within each source.
 */
public class HyperlinkEdges implements Iterable<HyperlinkEdge> {

    /**
     * The targets linked from one source URL, plus the depth assigned to that source.
     */
    public static class Targets {
        /** targets of the source, in insertion order */
        public Set<HyperlinkEdge.Target> targets;
        /** depth of the source; -1 if no depth has been assigned */
        public int depth;

        /**
         * @param depth the initial depth of the source, -1 if unknown
         */
        public Targets(int depth) {
            this.targets = new LinkedHashSet<HyperlinkEdge.Target>();
            this.depth = depth;
        }
    }

    /** source URL -> targets and depth of that source */
    private final Map<MultiProtocolURL, Targets> edges;
    /** depth of URLs which have no entry in {@link #edges} */
    private final Map<MultiProtocolURL, Integer> singletonDepth;

    /**
     * Create an empty edge collection.
     */
    public HyperlinkEdges() {
        this.edges = new LinkedHashMap<MultiProtocolURL, Targets>();
        this.singletonDepth = new HashMap<MultiProtocolURL, Integer>();
    }

    /**
     * Add an edge; equivalent to {@code addEdge(edge.source, edge.target)}.
     * @param edge the edge to add
     */
    public void add(final HyperlinkEdge edge) {
        addEdge(edge.source, edge.target);
    }

    /**
     * Add an edge from source to target.
     * If the source is new, a depth that was recorded for it earlier with
     * {@link #updateDepth(MultiProtocolURL, int)} is carried over.
     * @param source the source URL of the edge
     * @param target the target of the edge
     */
    public void addEdge(final MultiProtocolURL source, final HyperlinkEdge.Target target) {
        // the depth now lives in the Targets object, so the singleton entry is dropped
        final Integer knownDepth = this.singletonDepth.remove(source);
        Targets targets = this.edges.get(source);
        if (targets == null) {
            targets = new Targets(knownDepth == null ? -1 : knownDepth.intValue());
            this.edges.put(source, targets);
        }
        targets.targets.add(target);
    }

    /**
     * @return the number of stored edges, summed over all sources
     */
    public int size() {
        int count = 0;
        for (final Targets t: this.edges.values()) count += t.targets.size();
        return count;
    }

    /**
     * Add all edges of another collection to this one.
     * Only the (source, target) pairs are copied; depth values of the other
     * collection are not transferred.
     * @param oe the collection whose edges are added
     */
    public void addAll(final HyperlinkEdges oe) {
        for (final Map.Entry<MultiProtocolURL, Targets> entry: oe.edges.entrySet()) {
            for (final HyperlinkEdge.Target t: entry.getValue().targets) {
                this.addEdge(entry.getKey(), t);
            }
        }
    }

    /**
     * Record a depth for a URL. If a depth is already known, the smaller of
     * the known and the new depth is kept; a known depth of -1 is always
     * replaced.
     * @param url the URL whose depth is updated
     * @param newdepth the depth candidate
     */
    public void updateDepth(final MultiProtocolURL url, final int newdepth) {
        final Targets targets = this.edges.get(url);
        if (targets == null) {
            // url has no outgoing edges here: apply the same "keep the minimum"
            // rule as below instead of overwriting with the latest value
            final Integer known = this.singletonDepth.get(url);
            if (known == null || known.intValue() == -1) {
                this.singletonDepth.put(url, Integer.valueOf(newdepth));
            } else {
                this.singletonDepth.put(url, Integer.valueOf(Math.min(known.intValue(), newdepth)));
            }
            return;
        }
        if (targets.depth == -1) {
            targets.depth = newdepth;
        } else {
            targets.depth = Math.min(targets.depth, newdepth);
        }
    }

    /**
     * @param url the URL to look up
     * @return the depth recorded for the URL, or -1 if none is recorded
     */
    public Integer getDepth(final MultiProtocolURL url) {
        final Targets targets = this.edges.get(url);
        if (targets != null) return targets.depth;
        final Integer d = this.singletonDepth.get(url);
        return d == null ? -1 : d.intValue();
    }

    /**
     * Iterate over all edges. Every call of {@code next()} creates a new
     * {@link HyperlinkEdge} object which shares the stored source and target.
     * {@code remove()} deletes the target of the last returned edge from its
     * source; the source entry itself stays in the collection, even if it has
     * no targets left.
     */
    @Override
    public Iterator<HyperlinkEdge> iterator() {
        final Iterator<Map.Entry<MultiProtocolURL, Targets>> sources = this.edges.entrySet().iterator();
        return new Iterator<HyperlinkEdge>() {
            /** source belonging to {@link #pending} */
            private MultiProtocolURL source = null;
            /** target iterator which delivers the next element */
            private Iterator<HyperlinkEdge.Target> pending = null;
            /** target iterator which delivered the last element; needed for remove() */
            private Iterator<HyperlinkEdge.Target> lastReturned = null;

            @Override
            public boolean hasNext() {
                // skip exhausted and empty target sets, otherwise a trailing
                // source without targets would make us report a non-existing element
                while (this.pending == null || !this.pending.hasNext()) {
                    if (!sources.hasNext()) return false;
                    final Map.Entry<MultiProtocolURL, Targets> entry = sources.next();
                    this.source = entry.getKey();
                    this.pending = entry.getValue().targets.iterator();
                }
                return true;
            }

            @Override
            public HyperlinkEdge next() {
                if (!hasNext()) throw new java.util.NoSuchElementException();
                // remember the delivering iterator: hasNext() may move 'pending'
                // to the next source before remove() is called
                this.lastReturned = this.pending;
                return new HyperlinkEdge(this.source, this.pending.next());
            }

            @Override
            public void remove() {
                if (this.lastReturned == null) throw new IllegalStateException();
                this.lastReturned.remove();
            }
        };
    }
}

@ -24,7 +24,6 @@ import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
@ -52,13 +51,11 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
}
}
Map<String, HyperlinkEdge> edges;
Map<MultiProtocolURL, Integer> depths;
HyperlinkEdges edges;
String hostname;
public HyperlinkGraph() {
this.edges = new LinkedHashMap<String, HyperlinkEdge>();
this.depths = new HashMap<MultiProtocolURL, Integer>();
this.edges = new HyperlinkEdges();
this.hostname = null;
}
@ -79,14 +76,14 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
);
SolrDocument doc;
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
HyperlinkEdges inboundEdges = new HyperlinkEdges();
HyperlinkEdges outboundEdges = new HyperlinkEdges();
HyperlinkEdges errorEdges = new HyperlinkEdges();
try {
retrieval: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
MultiProtocolURL from = new MultiProtocolURL(u);
String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
if (error != null) {
@ -97,9 +94,9 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
inboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkType.Inbound));
HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Inbound);
String edgehash = ids + ASCII.String(new DigestURL(link, null).hash());
inboundEdges.addEdge(from, linkurl);
if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
} catch (MalformedURLException e) {}
}
@ -107,9 +104,9 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
while (links.hasNext()) {
link = links.next();
try {
DigestURL linkurl = new DigestURL(link, null);
String edgehash = ids + ASCII.String(linkurl.hash());
outboundEdges.put(edgehash, new HyperlinkEdge(from, linkurl, HyperlinkType.Outbound));
HyperlinkEdge.Target linkurl = new HyperlinkEdge.Target(link, HyperlinkType.Outbound);
String edgehash = ids + ASCII.String(new DigestURL(link, null).hash());
outboundEdges.addEdge(from, linkurl);
if (stopURL != null && linkurl.equals(stopURL)) break retrieval;
} catch (MalformedURLException e) {}
}
@ -122,31 +119,31 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
} catch (MalformedURLException e) {
}
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator<Map.Entry<String, HyperlinkEdge>> i = inboundEdges.entrySet().iterator();
Map.Entry<String, HyperlinkEdge> edge;
Iterator<HyperlinkEdge> i = inboundEdges.iterator();
HyperlinkEdge edge;
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkType.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
}
i = outboundEdges.entrySet().iterator();
i = outboundEdges.iterator();
while (i.hasNext()) {
edge = i.next();
if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
if (errorDocs.containsKey(edge.target.toNormalform(true))) {
i.remove();
edge.getValue().type = HyperlinkType.Dead;
errorEdges.put(edge.getKey(), edge.getValue());
edge.target.type = HyperlinkType.Dead;
errorEdges.add(edge);
}
}
// we put all edges together in a specific order which is used to create nodes in a svg display:
// nodes that appear first are possibly painted over by nodes coming later.
// less important nodes shall appear therefore first
this.edges.putAll(outboundEdges);
this.edges.putAll(inboundEdges);
this.edges.putAll(errorEdges);
this.edges.addAll(outboundEdges);
this.edges.addAll(inboundEdges);
this.edges.addAll(errorEdges);
}
public void path(final Segment segment, ReferenceReportCache rrc, DigestURL from, DigestURL to, final int maxtime, final int maxnodes) {
@ -166,11 +163,11 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
// first find root nodes
Set<MultiProtocolURL> nodes = new HashSet<MultiProtocolURL>();
Set<MultiProtocolURL> nextnodes = new HashSet<MultiProtocolURL>();
for (HyperlinkEdge edge: this.edges.values()) {
for (HyperlinkEdge edge: this.edges) {
String path = edge.source.getPath();
if (ROOTFNS.contains(path)) {
if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, 0);
if (edge.type == HyperlinkType.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, 1);
this.edges.updateDepth(edge.source, 0);
if (edge.target.type == HyperlinkType.Inbound) this.edges.updateDepth(edge.target, 1);
nodes.add(edge.source);
nextnodes.add(edge.target);
remaining--;
@ -192,10 +189,10 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
boolean found = false;
nodes = nextnodes;
nextnodes = new HashSet<MultiProtocolURL>();
for (HyperlinkEdge edge: this.edges.values()) {
for (HyperlinkEdge edge: this.edges) {
if (nodes.contains(edge.source)) {
if (!this.depths.containsKey(edge.source)) this.depths.put(edge.source, depth);
if (edge.type == HyperlinkType.Inbound && !this.depths.containsKey(edge.target)) this.depths.put(edge.target, depth + 1);
this.edges.updateDepth(edge.source, depth);
if (edge.target.type == HyperlinkType.Inbound) this.edges.updateDepth(edge.target, depth + 1);
nextnodes.add(edge.target);
remaining--;
found = true;
@ -209,12 +206,12 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
}
public Integer getDepth(MultiProtocolURL url) {
return this.depths.get(url);
return this.edges.getDepth(url);
}
@Override
public Iterator<HyperlinkEdge> iterator() {
return this.edges.values().iterator();
return this.edges.iterator();
}
}

Loading…
Cancel
Save