From a29a11e5263a05bf956e7fc5d8df6853cfd01c1e Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 3 Apr 2009 07:59:49 +0000 Subject: [PATCH] added evaluation of incoming links in webstructure api the api hash changed, new XML schema. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5774 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/WebStructurePicture_p.java | 4 +- htroot/api/webstructure.java | 49 ++++++++++++------- htroot/api/webstructure.xml | 21 ++++++-- .../de/anomic/plasma/plasmaWebStructure.java | 45 +++++++++++++++-- 4 files changed, 90 insertions(+), 29 deletions(-) diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 5425f721c..1d7756677 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -93,7 +93,7 @@ public class WebStructurePicture_p { try { hash = (new yacyURL("http://" + host, null)).hash().substring(6); } catch (final MalformedURLException e) {e.printStackTrace();} - assert (sb.webStructure.references(hash) != null); + assert (sb.webStructure.outgoingReferences(hash) != null); // recursively find domains, up to a specific depth final ymageGraph graph = new ymageGraph(); @@ -125,7 +125,7 @@ public class WebStructurePicture_p { if (nextlayer == maxlayer) return mynodes; nextlayer++; final double radius = 1.0 / (1 << nextlayer); - plasmaWebStructure.structureEntry sr = structure.references(centerhash); + plasmaWebStructure.structureEntry sr = structure.outgoingReferences(centerhash); final Map next = (sr == null) ? new HashMap() : sr.references; Map.Entry entry; String targethash, targethost; diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index ee13371c1..ff2cba0d6 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -40,7 +40,8 @@ public class webstructure { final plasmaSwitchboard sb = (plasmaSwitchboard) env; final boolean latest = ((post == null) ? false : post.containsKey("latest")); String about = ((post == null) ? null : post.get("about", null)); - + prop.put("out", 0); + prop.put("in", 0); if (about != null) { yacyURL url = null; if (about.length() > 6) { @@ -52,16 +53,24 @@ public class webstructure { } } if (url != null && about != null) { - - plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about); + plasmaWebStructure.structureEntry sentry = sb.webStructure.outgoingReferences(about); + if (sentry != null) { + reference(prop, "out", 0, sentry, sb.webStructure); + prop.put("out_domains", 1); + prop.put("out", 1); + } else { + prop.put("out_domains", 0); + prop.put("out", 1); + } + sentry = sb.webStructure.incomingReferences(about); if (sentry != null) { - reference(prop, 0, sentry, sb.webStructure); - prop.put("domains", 1); + reference(prop, "in", 0, sentry, sb.webStructure); + prop.put("in_domains", 1); + prop.put("in", 1); } else { - prop.put("domains", 0); + prop.put("in_domains", 0); + prop.put("in", 1); } - } else { - prop.put("domains", 0); } } else { final Iterator i = sb.webStructure.structureEntryIterator(latest); @@ -69,22 +78,24 @@ public class webstructure { plasmaWebStructure.structureEntry sentry; while (i.hasNext()) { sentry = i.next(); - reference(prop, c, sentry, sb.webStructure); + reference(prop, "out", c, sentry, sb.webStructure); c++; } - prop.put("domains", c); + prop.put("out_domains", c); + prop.put("out", 1); if (latest) sb.webStructure.joinOldNew(); } - prop.put("maxref", plasmaWebStructure.maxref); + prop.put("out_maxref", plasmaWebStructure.maxref); + prop.put("maxhosts", plasmaWebStructure.maxhosts); // return rewrite properties return prop; } - public static void reference(serverObjects prop, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) { - prop.put("domains_" + c + "_hash", sentry.domhash); - prop.put("domains_" + c + "_domain", sentry.domain); - prop.put("domains_" + c + "_date", sentry.date); + public static void reference(serverObjects prop, String prefix, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) { + prop.put(prefix + "_domains_" + c + "_hash", sentry.domhash); + prop.put(prefix + "_domains_" + c + "_domain", sentry.domain); + prop.put(prefix + "_domains_" + c + "_date", sentry.date); Iterator> k = sentry.references.entrySet().iterator(); Map.Entry refentry; String refdom, refhash; @@ -95,12 +106,12 @@ public class webstructure { refhash = refentry.getKey(); refdom = ws.resolveDomHash2DomString(refhash); if (refdom == null) continue refloop; - prop.put("domains_" + c + "_citations_" + d + "_refhash", refhash); - prop.put("domains_" + c + "_citations_" + d + "_refdom", refdom); + prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refhash", refhash); + prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refdom", refdom); refcount = refentry.getValue(); - prop.put("domains_" + c + "_citations_" + d + "_refcount", refcount.intValue()); + prop.put(prefix + "_domains_" + c + "_citations_" + d + "_refcount", refcount.intValue()); d++; } - prop.put("domains_" + c + "_citations", d); + prop.put(prefix + "_domains_" + c + "_citations", d); } } diff --git a/htroot/api/webstructure.xml b/htroot/api/webstructure.xml index 05f1237a3..578126e1e 100644 --- a/htroot/api/webstructure.xml +++ b/htroot/api/webstructure.xml @@ -1,12 +1,25 @@ - - + +#(out)#:: + #{domains}# #{citations}# - + #[refdom]# #{/citations}# #{/domains}# - + +#(/out)# +#(in)#:: + +#{domains}# + +#{citations}# + #[refdom]# +#{/citations}# + +#{/domains}# + +#(/in)# \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaWebStructure.java b/source/de/anomic/plasma/plasmaWebStructure.java index edf1ece95..accab3584 100644 --- a/source/de/anomic/plasma/plasmaWebStructure.java +++ b/source/de/anomic/plasma/plasmaWebStructure.java @@ -49,7 +49,7 @@ public class plasmaWebStructure { public static int maxCRLDump = 500000; public static int maxCRGDump = 200000; public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia) - public static int maxhosts = 8000; // maximum number of hosts in web structure map + public static int maxhosts = 20000; // maximum number of hosts in web structure map private StringBuilder crg; // global citation references private final Log log; @@ -221,7 +221,7 @@ public class plasmaWebStructure { return s.toString(); } - public structureEntry references(final String domhash) { + public structureEntry outgoingReferences(final String domhash) { // returns a map with a domhash(String):refcount(Integer) relation assert domhash.length() == 6; SortedMap tailMap; @@ -258,6 +258,39 @@ public class plasmaWebStructure { return new structureEntry(domhash, domain, date, h); } + public structureEntry incomingReferences(final String domhash) { + // collect the references + final Iterator i = structureEntryIterator(false); + plasmaWebStructure.structureEntry sentry; + HashMap domhashes = new HashMap(); + while (i.hasNext()) { + sentry = i.next(); + if (sentry.references.containsKey(domhash)) { + domhashes.put(sentry.domhash, sentry.references.get(domhash)); + } + } + // construct a new structureEntry Object + return new structureEntry( + domhash, + resolveDomHash2DomString(domhash), + DateFormatter.formatShortDay(new Date()), + domhashes); + } + + public HashMap incomingDomains(final String domhash) { + // collect the references + final Iterator i = structureEntryIterator(false); + plasmaWebStructure.structureEntry sentry; + HashMap domains = new HashMap(); + while (i.hasNext()) { + sentry = i.next(); + if (sentry.references.containsKey(domhash)) { + domains.put(sentry.domain, sentry.references.get(domhash)); + } + } + return domains; + } + public int referencesCount(final String domhash) { // returns the number of domains that are referenced by this domhash assert domhash.length() == 6 : "domhash = " + domhash; @@ -313,7 +346,7 @@ public class plasmaWebStructure { final String domhash = url.hash().substring(6); // parse the new reference string and join it with the stored references - structureEntry structure = references(domhash); + structureEntry structure = outgoingReferences(domhash); final Map refs = (structure == null) ? new HashMap() : structure.references; assert reference.length() % 12 == 0; String dom; @@ -462,7 +495,11 @@ public class plasmaWebStructure { public static class structureEntry { public String domhash, domain, date; public Map references; - public structureEntry(final String domhash, final String domain, final String date, final Map references) { + public structureEntry( + final String domhash, + final String domain, + final String date, + final Map references) { this.domhash = domhash; this.domain = domain; this.date = date;