added an api to retrieve the new citation.index with the

webstructure.xml api. This api will respond with details about a single
URL if requested with 'webstructure.xml?about=[url|urlhash|host]'.
pull/1/head
Michael Christen 13 years ago
parent 8fc86fe397
commit 71649a1296

@ -22,15 +22,23 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -40,40 +48,85 @@ public class webstructure {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
String about = post == null ? null : post.get("about", null);
String about = post == null ? null : post.get("about", null); // may be a URL, a URL hash or a domain hash
prop.put("out", 0);
prop.put("in", 0);
prop.put("citations", 0);
boolean authenticated = sb.adminAuthenticated(header) >= 2;
if (about != null) {
DigestURI url = null;
if (about.length() > 6) {
byte[] urlhash = null;
String hosthash = null;
if (about.length() == 6 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
hosthash = about;
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
urlhash = ASCII.getBytes(about);
hosthash = about.substring(6);
url = authenticated ? sb.getURL(Segments.Process.PUBLIC, urlhash) : null;
} else if (authenticated && about.length() > 0) {
// consider "about" as url or hostname
try {
url = new DigestURI(about);
about = ASCII.String(url.hash(), 6, 6);
url = new DigestURI(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
urlhash = url.hash();
hosthash = ASCII.String(urlhash, 6, 6);
} catch (MalformedURLException e) {
about = null;
}
}
if (url != null && about != null) {
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(about);
if (hosthash != null) {
prop.put("out", 1);
prop.put("in", 1);
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash);
if (sentry != null) {
reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1);
prop.put("out", 1);
} else {
prop.put("out_domains", 0);
prop.put("out", 1);
}
sentry = sb.webStructure.incomingReferences(about);
sentry = sb.webStructure.incomingReferences(hosthash);
if (sentry != null) {
reference(prop, "in", 0, sentry, sb.webStructure);
prop.put("in_domains", 1);
prop.put("in", 1);
} else {
prop.put("in_domains", 0);
prop.put("in", 1);
}
}
} else if (sb.adminAuthenticated(header) >= 2) {
if (urlhash != null) {
prop.put("citations", 1);
IndexCell<CitationReference> citationReferences = sb.indexSegments.segment(Segments.Process.PUBLIC).urlCitation();
ReferenceContainer<CitationReference> citations = null;
// citationReferences.count(urlhash) would give to the number of references good for ranking
try {
citations = citationReferences.get(urlhash, null);
} catch (IOException e) {
}
if (citations != null) {
prop.put("citations_anchors", 1);
prop.put("citations_anchors_0_hash", urlhash);
prop.put("citations_anchors_0_citationscount", citations.size());
prop.put("citations_anchors_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(citations.lastWrote())));
prop.put("citations_anchors_0_urle", url == null ? 0 : 1);
if (url != null) prop.put("citations_anchors_0_urle_url", url.toNormalform(true, false));
Iterator<CitationReference> i = citations.entries();
int d = 0;
CitationReference cr;
byte[] refhash;
DigestURI refurl;
while (i.hasNext()) {
cr = i.next();
refhash = cr.urlhash();
refurl = authenticated ? sb.getURL(Segments.Process.PUBLIC, refhash) : null;
prop.put("citations_anchors_0_citations_" + d + "_refurle", refurl == null ? 0 : 1);
if (refurl != null) prop.put("citations_anchors_0_citations_" + d + "_refurle_refurl", refurl.toNormalform(true, false));
prop.put("citations_anchors_0_citations_" + d + "_refurle_refhash", refhash);
prop.put("citations_anchors_0_citations_" + d + "_refurle_refdate", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(cr.lastModified())));
d++;
}
prop.put("citations_anchors_0_citations", d);
} else {
prop.put("citations_anchors", 0);
}
}
} else if (authenticated) {
// show a complete list of link structure informations in case that the user is authenticated
final boolean latest = ((post == null) ? false : post.containsKey("latest"));
final Iterator<WebStructureGraph.StructureEntry> i = sb.webStructure.structureEntryIterator(latest);

@ -22,4 +22,16 @@
#{/domains}#
</references>
#(/in)#
#(citations)#::
<citations count="#[anchors]#">
#{anchors}#
<anchor id="#[hash]#" date="#[date]#" count="#[citationscount]#">
#(urle)#::<url>#[url]#</url>#(/urle)#
#{citations}#
#(refurle)#<citation id="#[refhash]#" date="#[refdate]#"/>::<citation id="#[refhash]#" date="#[refdate]#">#[refurl]#</citation>#(/refurle)#
#{/citations}#
</anchor>
#{/anchors}#
</citations>
#(/citations)#
</webstructure>
Loading…
Cancel
Save