From bd409fb7ba9dc88528763035029b234135017746 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 1 Apr 2009 14:53:23 +0000 Subject: [PATCH] added web structure analysis for a special domain that can be requested from the API. Example: http://localhost:8080/api/webstructure.xml?about=www.yacy.net returns an XML with the following content: git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5766 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/WebStructurePicture_p.java | 4 +- htroot/api/webstructure.java | 86 +++++++++++++------ .../de/anomic/plasma/plasmaWebStructure.java | 22 +++-- 3 files changed, 78 insertions(+), 34 deletions(-) diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 319e0bcc5..5425f721c 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -27,6 +27,7 @@ import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Date; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -124,7 +125,8 @@ public class WebStructurePicture_p { if (nextlayer == maxlayer) return mynodes; nextlayer++; final double radius = 1.0 / (1 << nextlayer); - final Map next = structure.references(centerhash); + plasmaWebStructure.structureEntry sr = structure.references(centerhash); + final Map next = (sr == null) ? 
new HashMap() : sr.references; Map.Entry entry; String targethash, targethost; // first set points to next hosts diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index a41e5a745..9ce2f7a4a 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -22,6 +22,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.net.MalformedURLException; import java.util.Iterator; import java.util.Map; @@ -30,6 +31,7 @@ import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWebStructure; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.yacy.yacyURL; public class webstructure { @@ -37,39 +39,67 @@ public class webstructure { final serverObjects prop = new serverObjects(); final plasmaSwitchboard sb = (plasmaSwitchboard) env; final boolean latest = ((post == null) ? false : post.containsKey("latest")); - final Iterator i = sb.webStructure.structureEntryIterator(latest); - int c = 0, d; - plasmaWebStructure.structureEntry sentry; - Map.Entry refentry; - String refdom, refhash; - Integer refcount; - Iterator> k; - while (i.hasNext()) { - sentry = i.next(); - prop.put("domains_" + c + "_hash", sentry.domhash); - prop.put("domains_" + c + "_domain", sentry.domain); - prop.put("domains_" + c + "_date", sentry.date); - k = sentry.references.entrySet().iterator(); - d = 0; - refloop: while (k.hasNext()) { - refentry = k.next(); - refhash = refentry.getKey(); - refdom = sb.webStructure.resolveDomHash2DomString(refhash); - if (refdom == null) continue refloop; - prop.put("domains_" + c + "_citations_" + d + "_refhash", refhash); - prop.put("domains_" + c + "_citations_" + d + "_refdom", refdom); - refcount = refentry.getValue(); - prop.put("domains_" + c + "_citations_" + d + "_refcount", refcount.intValue()); - d++; + String about = ((post == null) ? 
null : post.get("about", null)); + + if (about != null) { + yacyURL url = null; + if (about.length() > 6) { + try { + url = new yacyURL(about, null); + about = url.hash().substring(6); + } catch (MalformedURLException e) { + about = null; + } + } + if (about != null) { + plasmaWebStructure.structureEntry sentry = sb.webStructure.references(about); + if (sentry != null) { + reference(prop, 0, sentry, sb.webStructure); + prop.put("domains", 1); + } else { + prop.put("domains", 0); + } + } else { + prop.put("domains", 0); + } + } else { + final Iterator i = sb.webStructure.structureEntryIterator(latest); + int c = 0; + plasmaWebStructure.structureEntry sentry; + while (i.hasNext()) { + sentry = i.next(); + reference(prop, c, sentry, sb.webStructure); + c++; } - prop.put("domains_" + c + "_citations", d); - c++; + prop.put("domains", c); + if (latest) sb.webStructure.joinOldNew(); } - prop.put("domains", c); prop.put("maxref", plasmaWebStructure.maxref); - if (latest) sb.webStructure.joinOldNew(); // return rewrite properties return prop; } + + public static void reference(serverObjects prop, int c, plasmaWebStructure.structureEntry sentry, plasmaWebStructure ws) { + prop.put("domains_" + c + "_hash", sentry.domhash); + prop.put("domains_" + c + "_domain", sentry.domain); + prop.put("domains_" + c + "_date", sentry.date); + Iterator> k = sentry.references.entrySet().iterator(); + Map.Entry refentry; + String refdom, refhash; + Integer refcount; + int d = 0; + refloop: while (k.hasNext()) { + refentry = k.next(); + refhash = refentry.getKey(); + refdom = ws.resolveDomHash2DomString(refhash); + if (refdom == null) continue refloop; + prop.put("domains_" + c + "_citations_" + d + "_refhash", refhash); + prop.put("domains_" + c + "_citations_" + d + "_refdom", refdom); + refcount = refentry.getValue(); + prop.put("domains_" + c + "_citations_" + d + "_refcount", refcount.intValue()); + d++; + } + prop.put("domains_" + c + "_citations", d); + } } diff --git 
a/source/de/anomic/plasma/plasmaWebStructure.java b/source/de/anomic/plasma/plasmaWebStructure.java index a50f9eecf..67710a4cb 100644 --- a/source/de/anomic/plasma/plasmaWebStructure.java +++ b/source/de/anomic/plasma/plasmaWebStructure.java @@ -221,17 +221,23 @@ public class plasmaWebStructure { return s.toString(); } - public Map references(final String domhash) { + public structureEntry references(final String domhash) { // returns a map with a domhash(String):refcount(Integer) relation assert domhash.length() == 6; SortedMap tailMap; Map h = new HashMap(); + String domain = ""; + String date = ""; + String ref; synchronized (structure_old) { tailMap = structure_old.tailMap(domhash); if (!tailMap.isEmpty()) { final String key = tailMap.firstKey(); if (key.startsWith(domhash)) { - h = refstr2map(tailMap.get(key)); + domain = key.substring(7); + ref = tailMap.get(key); + date = ref.substring(0, 8); + h = refstr2map(ref); } } } @@ -240,11 +246,16 @@ public class plasmaWebStructure { if (!tailMap.isEmpty()) { final String key = tailMap.firstKey(); if (key.startsWith(domhash)) { - h.putAll(refstr2map(tailMap.get(key))); + ref = tailMap.get(key); + if (domain.length() == 0) domain = key.substring(7); + if (date.length() == 0) date = ref.substring(0, 8); + assert domain == key.substring(7) : "domain = " + domain + ", key = " + key; + h.putAll(refstr2map(ref)); } } } - return h; + if (h.size() == 0) return null; + return new structureEntry(domhash, domain, date, h); } public int referencesCount(final String domhash) { @@ -302,7 +313,8 @@ public class plasmaWebStructure { final String domhash = url.hash().substring(6); // parse the new reference string and join it with the stored references - final Map refs = references(domhash); + structureEntry structure = references(domhash); + final Map refs = (structure == null) ? new HashMap() : structure.references; assert reference.length() % 12 == 0; String dom; int c;