added evaluation of incoming links in webstructure api

the api has changed, new XML schema.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5774 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent f6691411b5
commit a29a11e526

@ -93,7 +93,7 @@ public class WebStructurePicture_p {
try {
hash = (new yacyURL("http://" + host, null)).hash().substring(6);
} catch (final MalformedURLException e) {e.printStackTrace();}
assert (sb.webStructure.references(hash) != null);
assert (sb.webStructure.outgoingReferences(hash) != null);
// recursively find domains, up to a specific depth
final ymageGraph graph = new ymageGraph();
@ -125,7 +125,7 @@ public class WebStructurePicture_p {
if (nextlayer == maxlayer) return mynodes;
nextlayer++;
final double radius = 1.0 / (1 << nextlayer);
plasmaWebStructure.structureEntry sr = structure.references(centerhash);
plasmaWebStructure.structureEntry sr = structure.outgoingReferences(centerhash);
final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references;
Map.Entry<String, Integer> entry;
String targethash, targethost;

@ -40,7 +40,8 @@ public class webstructure {
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
String about = ((post == null) ? null : post.get("about", null));
prop.put("out", 0);
prop.put("in", 0);
if (about != null) {
yacyURL url = null;
if (about.length() > 6) {
@ -52,16 +53,24 @@ public class webstructure {
}
}
if (url != null && about != null) {
plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about);
plasmaWebStructure.structureEntry sentry = sb.webStructure.outgoingReferences(about);
if (sentry != null) {
reference(prop, 0, sentry, sb.webStructure);
prop.put("domains", 1);
reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1);
prop.put("out", 1);
} else {
prop.put("domains", 0);
prop.put("out_domains", 0);
prop.put("out", 1);
}
sentry = sb.webStructure.incomingReferences(about);
if (sentry != null) {
reference(prop, "in", 0, sentry, sb.webStructure);
prop.put("in_domains", 1);
prop.put("in", 1);
} else {
prop.put("domains", 0);
prop.put("in_domains", 0);
prop.put("in", 1);
}
}
} else {
final Iterator<plasmaWebStructure.structureEntry> i = sb.webStructure.structureEntryIterator(latest);
@ -69,22 +78,24 @@ public class webstructure {
plasmaWebStructure.structureEntry sentry;
while (i.hasNext()) {
sentry = i.next();
reference(prop, c, sentry, sb.webStructure);
reference(prop, "out", c, sentry, sb.webStructure);
c++;
}
prop.put("domains", c);
prop.put("out_domains", c);
prop.put("out", 1);
if (latest) sb.webStructure.joinOldNew();
}
prop.put("maxref", plasmaWebStructure.maxref);
prop.put("out_maxref", plasmaWebStructure.maxref);
prop.put("maxhosts", plasmaWebStructure.maxhosts);
// return rewrite properties
return prop;
}
public static void reference(serverObjects prop, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) {
prop.put("domains_" + c + "_hash", sentry.domhash);
prop.put("domains_" + c + "_domain", sentry.domain);
prop.put("domains_" + c + "_date", sentry.date);
public static void reference(serverObjects prop, String prefix, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) {
prop.put(prefix + "_domains_" + c + "_hash", sentry.domhash);
prop.put(prefix + "_domains_" + c + "_domain", sentry.domain);
prop.put(prefix + "_domains_" + c + "_date", sentry.date);
Iterator<Map.Entry<String, Integer>> k = sentry.references.entrySet().iterator();
Map.Entry<String, Integer> refentry;
String refdom, refhash;
@ -95,12 +106,12 @@ public class webstructure {
refhash = refentry.getKey();
refdom = ws.resolveDomHash2DomString(refhash);
if (refdom == null) continue refloop;
prop.put("domains_" + c + "_citations_" + d + "_refhash", refhash);
prop.put("domains_" + c + "_citations_" + d + "_refdom", refdom);
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refhash", refhash);
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refdom", refdom);
refcount = refentry.getValue();
prop.put("domains_" + c + "_citations_" + d + "_refcount", refcount.intValue());
prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refcount", refcount.intValue());
d++;
}
prop.put("domains_" + c + "_citations", d);
prop.put(prefix + "_domains_" + c + "_citations", d);
}
}

@ -1,12 +1,25 @@
<?xml version="1.0"?>
<webstructure>
<domains reference="reverse" count="#[domains]#" maxref="#[maxref]#">
<webstructure maxhosts="#[maxhosts]#">
#(out)#::
<references direction="out" count="#[domains]#" maxref="#[maxref]#">
#{domains}#
<domain host="#[domain]#" id="#[hash]#" date="#[date]#">
#{citations}#
<citation host="#[refdom]#" id="#[refhash]#" count="#[refcount]#" />
<reference id="#[refhash]#" count="#[refcount]#">#[refdom]#</reference>
#{/citations}#
</domain>
#{/domains}#
</domains>
</references>
#(/out)#
#(in)#::
<references direction="in" count="#[domains]#">
#{domains}#
<domain host="#[domain]#" id="#[hash]#" date="#[date]#">
#{citations}#
<reference id="#[refhash]#" count="#[refcount]#">#[refdom]#</reference>
#{/citations}#
</domain>
#{/domains}#
</references>
#(/in)#
</webstructure>

@ -49,7 +49,7 @@ public class plasmaWebStructure {
public static int maxCRLDump = 500000;
public static int maxCRGDump = 200000;
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 8000; // maximum number of hosts in web structure map
public static int maxhosts = 20000; // maximum number of hosts in web structure map
private StringBuilder crg; // global citation references
private final Log log;
@ -221,7 +221,7 @@ public class plasmaWebStructure {
return s.toString();
}
public structureEntry references(final String domhash) {
public structureEntry outgoingReferences(final String domhash) {
// returns a map with a domhash(String):refcount(Integer) relation
assert domhash.length() == 6;
SortedMap<String, String> tailMap;
@ -258,6 +258,39 @@ public class plasmaWebStructure {
return new structureEntry(domhash, domain, date, h);
}
/**
 * Builds a structure entry describing which entries reference the given domain hash.
 *
 * Every entry delivered by structureEntryIterator(false) is inspected; when its
 * reference map contains domhash as a key, the entry's own domhash is recorded
 * together with the value stored under domhash in that map.
 *
 * @param domhash the domain hash to look up as a reference target
 * @return a new structureEntry carrying domhash, the domain string resolved by
 *         resolveDomHash2DomString (passed on as-is), the current date formatted by
 *         DateFormatter.formatShortDay, and the collected map; the result is never
 *         null, the map is empty when no entry references domhash
 */
public structureEntry incomingReferences(final String domhash) {
    // scan all entries and keep those that have domhash among their references
    final Map<String, Integer> referrers = new HashMap<String, Integer>();
    for (final Iterator<plasmaWebStructure.structureEntry> entries = structureEntryIterator(false); entries.hasNext();) {
        final plasmaWebStructure.structureEntry candidate = entries.next();
        if (!candidate.references.containsKey(domhash)) continue;
        referrers.put(candidate.domhash, candidate.references.get(domhash));
    }

    // wrap the collected map into a fresh entry stamped with today's date
    final String domain = resolveDomHash2DomString(domhash);
    final String today = DateFormatter.formatShortDay(new Date());
    return new structureEntry(domhash, domain, today, referrers);
}
/**
 * Collects the domains of all entries that reference the given domain hash.
 *
 * Every entry delivered by structureEntryIterator(false) is inspected; when its
 * reference map contains domhash as a key, the entry's domain string is mapped
 * to the value stored under domhash in that map.
 *
 * @param domhash the domain hash to look up as a reference target
 * @return a map from referencing domain string to the stored value; empty when
 *         no entry references domhash
 */
public HashMap<String, Integer> incomingDomains(final String domhash) {
    final HashMap<String, Integer> referringDomains = new HashMap<String, Integer>();
    for (final Iterator<plasmaWebStructure.structureEntry> entries = structureEntryIterator(false); entries.hasNext();) {
        final plasmaWebStructure.structureEntry candidate = entries.next();
        // only entries that list domhash among their references are of interest
        if (!candidate.references.containsKey(domhash)) continue;
        referringDomains.put(candidate.domain, candidate.references.get(domhash));
    }
    return referringDomains;
}
public int referencesCount(final String domhash) {
// returns the number of domains that are referenced by this domhash
assert domhash.length() == 6 : "domhash = " + domhash;
@ -313,7 +346,7 @@ public class plasmaWebStructure {
final String domhash = url.hash().substring(6);
// parse the new reference string and join it with the stored references
structureEntry structure = references(domhash);
structureEntry structure = outgoingReferences(domhash);
final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0;
String dom;
@ -462,7 +495,11 @@ public class plasmaWebStructure {
public static class structureEntry {
public String domhash, domain, date;
public Map<String, Integer> references;
public structureEntry(final String domhash, final String domain, final String date, final Map<String, Integer> references) {
public structureEntry(
final String domhash,
final String domain,
final String date,
final Map<String, Integer> references) {
this.domhash = domhash;
this.domain = domain;
this.date = date;

Loading…
Cancel
Save