Extended the webstructure API to show outgoing links together with incoming links
pull/1/head
Michael Christen 13 years ago
parent 02e4dedff2
commit 41be98dc9d

@ -30,9 +30,12 @@ import java.util.Map;
import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceContainer;
@ -51,6 +54,7 @@ public class webstructure {
String about = post == null ? null : post.get("about", null); // may be a URL, a URL hash or a domain hash String about = post == null ? null : post.get("about", null); // may be a URL, a URL hash or a domain hash
prop.put("out", 0); prop.put("out", 0);
prop.put("in", 0); prop.put("in", 0);
prop.put("references", 0);
prop.put("citations", 0); prop.put("citations", 0);
boolean authenticated = sb.adminAuthenticated(header) >= 2; boolean authenticated = sb.adminAuthenticated(header) >= 2;
if (about != null) { if (about != null) {
@ -91,6 +95,49 @@ public class webstructure {
} }
} }
if (urlhash != null) { if (urlhash != null) {
// anchors
prop.put("references", 1);
net.yacy.document.Document scraper = null;
if (url != null) try {
scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST);
} catch (final IOException e) {
Log.logException(e);
}
if (scraper != null) {
prop.put("references_count", 1);
prop.put("references_documents", 1);
prop.put("references_documents_0_hash", urlhash);
prop.put("references_documents_0_count", scraper.inboundLinkCount() + scraper.outboundLinkCount());
prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date()));
prop.put("references_documents_0_urle", url == null ? 0 : 1);
if (url != null) prop.put("references_documents_0_urle_url", url.toNormalform(true, false));
int d = 0;
Iterator<MultiProtocolURI> i = scraper.inboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next());
byte[] refhash = refurl.hash();
prop.put("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true, false));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
prop.put("references_documents_0_anchors_" + d + "_outbound", 0);
d++;
}
i = scraper.outboundLinks().iterator();
while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next());
byte[] refhash = refurl.hash();
prop.put("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true, false));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
prop.put("references_documents_0_anchors_" + d + "_outbound", 1);
d++;
}
prop.put("references_documents_0_count", d);
prop.put("references_documents_0_anchors", d);
} else {
prop.put("references_count", 0);
prop.put("references_documents", 0);
}
// citations
prop.put("citations", 1); prop.put("citations", 1);
IndexCell<CitationReference> citationReferences = sb.indexSegments.segment(Segments.Process.PUBLIC).urlCitation(); IndexCell<CitationReference> citationReferences = sb.indexSegments.segment(Segments.Process.PUBLIC).urlCitation();
ReferenceContainer<CitationReference> citations = null; ReferenceContainer<CitationReference> citations = null;
@ -100,30 +147,30 @@ public class webstructure {
} catch (IOException e) { } catch (IOException e) {
} }
if (citations != null) { if (citations != null) {
prop.put("citations_anchors", 1); prop.put("citations_count", 1);
prop.put("citations_anchors_0_hash", urlhash); prop.put("citations_documents", 1);
prop.put("citations_anchors_0_citationscount", citations.size()); prop.put("citations_documents_0_hash", urlhash);
prop.put("citations_anchors_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(citations.lastWrote()))); prop.put("citations_documents_0_count", citations.size());
prop.put("citations_anchors_0_urle", url == null ? 0 : 1); prop.put("citations_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(citations.lastWrote())));
if (url != null) prop.put("citations_anchors_0_urle_url", url.toNormalform(true, false)); prop.put("citations_documents_0_urle", url == null ? 0 : 1);
Iterator<CitationReference> i = citations.entries(); if (url != null) prop.put("citations_documents_0_urle_url", url.toNormalform(true, false));
int d = 0; int d = 0;
CitationReference cr; Iterator<CitationReference> i = citations.entries();
byte[] refhash; while (i.hasNext()) {
DigestURI refurl; CitationReference cr = i.next();
while (i.hasNext()) { byte[] refhash = cr.urlhash();
cr = i.next(); DigestURI refurl = authenticated ? sb.getURL(Segments.Process.PUBLIC, refhash) : null;
refhash = cr.urlhash(); prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
refurl = authenticated ? sb.getURL(Segments.Process.PUBLIC, refhash) : null; if (refurl != null) prop.put("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true, false));
prop.put("citations_anchors_0_citations_" + d + "_refurle", refurl == null ? 0 : 1); prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
if (refurl != null) prop.put("citations_anchors_0_citations_" + d + "_refurle_refurl", refurl.toNormalform(true, false)); prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(cr.lastModified())));
prop.put("citations_anchors_0_citations_" + d + "_refurle_refhash", refhash);
prop.put("citations_anchors_0_citations_" + d + "_refurle_refdate", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(cr.lastModified())));
d++; d++;
} }
prop.put("citations_anchors_0_citations", d); prop.put("citations_documents_0_count", d);
prop.put("citations_documents_0_anchors", d);
} else { } else {
prop.put("citations_anchors", 0); prop.put("citations_count", 0);
prop.put("citations_documents", 0);
} }
} }
} else if (authenticated) { } else if (authenticated) {
@ -151,7 +198,7 @@ public class webstructure {
// return rewrite properties // return rewrite properties
return prop; return prop;
} }
public static void reference(serverObjects prop, String prefix, int c, WebStructureGraph.StructureEntry sentry, WebStructureGraph ws) { public static void reference(serverObjects prop, String prefix, int c, WebStructureGraph.StructureEntry sentry, WebStructureGraph ws) {
prop.put(prefix + "_domains_" + c + "_hash", sentry.hosthash); prop.put(prefix + "_domains_" + c + "_hash", sentry.hosthash);
prop.put(prefix + "_domains_" + c + "_domain", sentry.hostname); prop.put(prefix + "_domains_" + c + "_domain", sentry.hostname);

@ -1,6 +1,8 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<webstructure maxhosts="#[maxhosts]#"> <webstructure maxhosts="#[maxhosts]#">
#(out)#:: #(out)#::
<!-- accumulated list of outgoing links to other domains (per host accumulated anchors)-->
<references direction="out" count="#[domains]#" maxref="#[maxref]#"> <references direction="out" count="#[domains]#" maxref="#[maxref]#">
#{domains}# #{domains}#
<domain host="#[domain]#" id="#[hash]#" date="#[date]#"> <domain host="#[domain]#" id="#[hash]#" date="#[date]#">
@ -11,7 +13,9 @@
#{/domains}# #{/domains}#
</references> </references>
#(/out)# #(/out)#
#(in)#:: #(in)#::
<!-- accumulated list of incoming links from other domains (per host accumulated references)-->
<references direction="in" count="#[domains]#"> <references direction="in" count="#[domains]#">
#{domains}# #{domains}#
<domain host="#[domain]#" id="#[hash]#" date="#[date]#"> <domain host="#[domain]#" id="#[hash]#" date="#[date]#">
@ -22,16 +26,33 @@
#{/domains}# #{/domains}#
</references> </references>
#(/in)# #(/in)#
#(citations)#::
<citations count="#[anchors]#"> #(references)#::
<!-- detailed list of outgoing links (anchors) from documents to references -->
<anchors direction="out" count="#[count]#">
#{documents}#
<document id="#[hash]#" date="#[date]#" count="#[count]#">
#(urle)#::<url>#[url]#</url>#(/urle)#
#{anchors}# #{anchors}#
<anchor id="#[hash]#" date="#[date]#" count="#[citationscount]#"> <anchor id="#[hash]#" type="#(outbound)#inbound::outbound#(/outbound)#">#[url]#</anchor>
#{/anchors}#
</document>
#{/documents}#
</anchors>
#(/references)#
#(citations)#::
<!-- detailed list of incoming links (citations) from other documents (their references) - reverse link structure -->
<anchors direction="in" count="#[count]#">
#{documents}#
<document id="#[hash]#" date="#[date]#" count="#[count]#">
#(urle)#::<url>#[url]#</url>#(/urle)# #(urle)#::<url>#[url]#</url>#(/urle)#
#{citations}# #{anchors}#
#(refurle)#<citation id="#[refhash]#" date="#[refdate]#"/>::<citation id="#[refhash]#" date="#[refdate]#">#[refurl]#</citation>#(/refurle)# #(urle)#<anchor id="#[hash]#" date="#[date]#"/>::<anchor id="#[hash]#" date="#[date]#">#[url]#</anchor>#(/urle)#
#{/citations}#
</anchor>
#{/anchors}# #{/anchors}#
</citations> </document>
#{/documents}#
</anchors>
#(/citations)# #(/citations)#
</webstructure> </webstructure>
Loading…
Cancel
Save