You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
yacy_search_server/htroot/api/linkstructure.java

186 lines
9.1 KiB

// linkstructure.java
// ------------
// (C) 2014 by Michael Peter Christen; mc@yacy.net
// first published 02.04.2014 on http://yacy.net
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.HyperlinkEdge;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.server.servletProperties;
public class linkstructure {

    /**
     * Servlet entry point: computes the hyperlink structure (an edge list) of a
     * single host from the Solr fulltext index and writes it into the returned
     * template properties as a {@code list_*} array suitable for the json/xml
     * templates and an svg graph display.
     *
     * @param header the request header; {@code EXT} selects the output format,
     *               and admin authentication raises the resource limits
     * @param post   request parameters: {@code about} (a URL, URL hash or domain,
     *               required), {@code maxtime} (ms), {@code maxnodes}
     * @param env    the Switchboard environment
     * @return the filled servlet properties; empty if {@code post} or a usable
     *         hostname is missing
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final servletProperties prop = new servletProperties();
        final String ext = header.get("EXT", "");
        final boolean xml = ext.equals("xml");
        final Switchboard sb = (Switchboard) env;
        final Fulltext fulltext = sb.index.fulltext();
        if (post == null) return prop;
        final String about = post.get("about", null); // may be a URL, a URL hash or a domain hash
        if (about == null) return prop;

        // unauthenticated users get tight time/size limits
        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
        final int maxtime = Math.min(post.getInt("maxtime", 1000), authenticated ? 60000 : 1000);
        final int maxnodes = Math.min(post.getInt("maxnodes", 100), authenticated ? 1000 : 100);

        DigestURL url = null;
        String hostname = null;
        if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
            // "about" is a 12-character url hash; resolving it requires authentication
            final byte[] urlhash = ASCII.getBytes(about);
            url = authenticated ? sb.getURL(urlhash) : null;
            // fix: derive the hostname from the resolved url; previously this branch
            // never set hostname and therefore always fell into the early return below
            if (url != null) hostname = stripWWW(url.getHost());
        } else if (about.length() > 0) {
            // consider "about" as url or hostname
            try {
                url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
                hostname = stripWWW(url.getHost());
            } catch (final MalformedURLException e) {
                // not a usable url/hostname; hostname stays null and we return below
            }
        }
        if (hostname == null) return prop;

        // now collect _all_ documents inside the domain until a timeout appears
        final StringBuilder q = new StringBuilder();
        q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname)
         .append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);
        final BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(
                q.toString(), 0, maxnodes, maxtime, 100, 1,
                CollectionSchema.id.getSolrFieldName(),
                CollectionSchema.sku.getSolrFieldName(),
                CollectionSchema.failreason_s.getSolrFieldName(),
                CollectionSchema.failtype_s.getSolrFieldName(),
                CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName());

        SolrDocument doc;
        final Map<String, FailType> errorDocs = new HashMap<String, FailType>();
        final Map<String, HyperlinkEdge> inboundEdges = new HashMap<String, HyperlinkEdge>();
        final Map<String, HyperlinkEdge> outboundEdges = new HashMap<String, HyperlinkEdge>();
        final Map<String, HyperlinkEdge> errorEdges = new HashMap<String, HyperlinkEdge>();
        try {
            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                final String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                final DigestURL from = new DigestURL(u, ASCII.getBytes(ids));
                final String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
                final FailType error = errortype == null ? null : FailType.valueOf(errortype);
                if (error != null) {
                    errorDocs.put(u, error);
                } else {
                    collectEdges(doc, from, ids, true, inboundEdges);
                    collectEdges(doc, from, ids, false, outboundEdges);
                }
                if (inboundEdges.size() + outboundEdges.size() > maxnodes) break;
            }
        } catch (final InterruptedException e) {
            // fix: restore interrupt status instead of swallowing; deliver partial result
            Thread.currentThread().interrupt();
        } catch (final MalformedURLException e) {
            // defective sku/id field in the index; stop collecting, keep what we have
        }

        // we use the errorDocs to mark all edges with endpoint to error documents
        moveDeadEdges(inboundEdges, errorDocs, errorEdges);
        moveDeadEdges(outboundEdges, errorDocs, errorEdges);

        // we put all edges together in a specific order which is used to create nodes in a svg display:
        // nodes that appear first are possibly painted over by nodes coming later.
        // less important nodes shall appear therefore first
        final Map<String, HyperlinkEdge> edges = new LinkedHashMap<String, HyperlinkEdge>();
        edges.putAll(outboundEdges);
        edges.putAll(inboundEdges);
        edges.putAll(errorEdges);

        // finally just write out the edge array
        int c = 0;
        for (final Map.Entry<String, HyperlinkEdge> e: edges.entrySet()) {
            prop.putJSON("list_" + c + "_source", e.getValue().source.getPath());
            prop.putJSON("list_" + c + "_target", e.getValue().type.equals(HyperlinkEdge.Type.Outbound) ? e.getValue().target.toNormalform(true) : e.getValue().target.getPath());
            prop.putJSON("list_" + c + "_type", e.getValue().type.name());
            prop.put("list_" + c + "_eol", 1);
            c++;
        }
        // fix: only rewrite the terminator of the last entry when at least one edge
        // was written; previously an empty result produced a bogus "list_-1_eol" key
        if (c > 0) prop.put("list_" + (c - 1) + "_eol", 0);
        prop.put("list", c);

        // Adding CORS Access header for xml output
        if (xml) {
            final ResponseHeader outgoingHeader = new ResponseHeader(200);
            outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
            prop.setOutgoingHeader(outgoingHeader);
        }

        // return rewrite properties
        return prop;
    }

    /**
     * Strips a leading "www." from a hostname. Null-safe: returns null for a null
     * host (e.g. a url without an authority part), avoiding the NPE the previous
     * inline code could raise.
     */
    private static String stripWWW(final String host) {
        if (host == null) return null;
        return host.startsWith("www.") ? host.substring(4) : host;
    }

    /**
     * Collects the in- or outbound link edges of one Solr document into the given
     * edge map, keyed by source-hash + target-hash; unparseable targets are skipped.
     */
    private static void collectEdges(final SolrDocument doc, final DigestURL from, final String ids,
            final boolean inbound, final Map<String, HyperlinkEdge> edgeMap) {
        final Iterator<String> links = URIMetadataNode.getLinks(doc, inbound);
        while (links.hasNext()) {
            final String link = links.next();
            try {
                final DigestURL linkurl = new DigestURL(link, null);
                final String edgehash = ids + ASCII.String(linkurl.hash());
                edgeMap.put(edgehash, new HyperlinkEdge(from, linkurl,
                        inbound ? HyperlinkEdge.Type.Inbound : HyperlinkEdge.Type.Outbound));
            } catch (final MalformedURLException e) {
                // skip link targets that cannot be parsed as a url
            }
        }
    }

    /**
     * Moves every edge whose target is a known error document from edgeMap into
     * errorEdges, re-typing it as Dead. Uses Iterator.remove to mutate safely
     * during iteration.
     */
    private static void moveDeadEdges(final Map<String, HyperlinkEdge> edgeMap,
            final Map<String, FailType> errorDocs, final Map<String, HyperlinkEdge> errorEdges) {
        final Iterator<Map.Entry<String, HyperlinkEdge>> i = edgeMap.entrySet().iterator();
        while (i.hasNext()) {
            final Map.Entry<String, HyperlinkEdge> edge = i.next();
            if (errorDocs.containsKey(edge.getValue().target.toNormalform(true))) {
                i.remove();
                edge.getValue().type = HyperlinkEdge.Type.Dead;
                errorEdges.put(edge.getKey(), edge.getValue());
            }
        }
    }
}