Added an optional parameter to the webstructure.xml API.

This new "documentStructure" parameter can be set to false to get only the per-host accumulated references for a resource, which prevents scraping the specified URL and fetching citation references. Also made the WebStructureGraph constants final and updated the Javadoc with example API call URLs.
8 years ago · e048e74072
parent 581b00cc20
commit e048e74072
2 changed files with 42 additions and 11 deletions
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -53,10 +53,10 @@ import net.yacy.server.serverSwitch;
 public class webstructure {

 	/**
-	 * Retrieve the locally known web links structure of a specified resource ("about" parameter supplied) or
+	 * <p>Retrieve the locally known web links structure of a specified resource ("about" parameter supplied) or
 	 * the whole computed links structure since install (no parameter supplied)
-	 * or since last start or last call ("latest" parameter supplied).
-	 * Returned object contains the following information :
+	 * or since last start or last call ("latest" parameter supplied).</p>
+	 * <p>Returned object contains the following information :
 	 * <ul>
 	 * 	<li>in all cases :
 	 * 		<ul>
@@ -66,12 +66,30 @@ public class webstructure {
 	 *  <li>when "about" parameter is filled :
 	 *  	<ul>
 	 * 			<li>accumulated list of incoming links from other domains (per host accumulated references)</li>
-	 * 			<li>detailed list of outgoing links (anchors) from documents to references</li>
+	 * 			<li>detailed list of outgoing links (anchors) from document at "about" URL to references</li>
     * 			<li>detailed list of incoming links (citations) from other documents (their references) - reverse link structure</li>
     * 		</ul>
     * 	</li>
 	 * </ul>
-	 * Information detail is limited by {@link WebStructureGraph#maxhosts} and {@link WebStructureGraph#maxref} constants.
+	 * <p>
+	 * Remarks :
+	 * <ul>
+	 * <li>Information detail is limited by {@link WebStructureGraph#maxhosts}, {@link WebStructureGraph#maxref} and {@link WebStructureGraph#MAX_PARSED_ANCHORS} constants.</li>
+	 * <li>Requesting client must be authenticated (as admin or requesting from localhost enabled) otherwise results will be empty</li>
+	 * </ul>
+	 * </p>
+	 * 
+	 * <p>
+	 * Example API calls :
+	 * <ul>
+	 * <li>domain name and index page structure : http://localhost:8090/api/webstructure.xml?about=yacy.net</li>
+	 * <li>domain name structure : http://localhost:8090/api/webstructure.xml?about=yacy.net&documentStructure=false</li>
+     * <li>hosts accumulated structure and specific resource structure : http://localhost:8090/api/webstructure.xml?about=http://yacy.net/fr/API.html</li>
+	 * <li>whole locally known hosts web structure : http://localhost:8090/api/webstructure.xml</li>
+	 * <li>recently locally computed hosts web structure : http://localhost:8090/api/webstructure.xml?latest=</li>
+	 * </ul>
+	 * </p>
+	 * 
 	 * 
 	 * @param header
 	 *            servlet request header
@@ -84,8 +102,8 @@ public class webstructure {
 	 *            <li>latest (ignored when about parameter is valued): get the structure that have been computed during
 	 *            the current run-time of YaCy, and with each next call only an
 	 *            update to the next list of references.</li>
-	 *            <li>agentName : name of the user agent string used to load the
-	 *            "about" resource</li>
+	 *            <li>agentName : name of the user agent string used to load the "about" resource</li>
+	 *            <li>documentStructure : set to false when you only want the hosts accumulated references for the "about" resource</li>
 	 *            </ul>
 	 * @param env
 	 *            server environment
@@ -153,7 +171,17 @@ public class webstructure {
        		prop.put("out_domains", outCount);
        		prop.put("in_domains", inCount);
            }
-            if (urlhash != null) {
+            
+			/*
+			 * It is possible not to scrape document and look for citations by
+			 * setting documentStructure parameter to "false"
+			 */
+			boolean documentStructure = true;
+			if (post != null && "false".equals(post.get("documentStructure", "true"))) {
+				documentStructure = false;
+			}
+            
+            if (urlhash != null && documentStructure) {
            	// anchors
                prop.put("references", 1);
                net.yacy.document.Document scraper = null;
--- a/source/net/yacy/peers/graphics/WebStructureGraph.java
+++ b/source/net/yacy/peers/graphics/WebStructureGraph.java
@@ -71,10 +71,13 @@ import net.yacy.search.Switchboard;
 public class WebStructureGraph {

 	/** Maximum number of references per host, to avoid overflow when a large link farm occurs (i.e. wikipedia) */
-    public static int maxref = 200;
+    public static final int maxref = 200;
    
    /** Maximum number of hosts in web structure map */
-    public static int maxhosts = 10000;
+    public static final int maxhosts = 10000;
+    
+    /** Maximum number of parsed anchors when computing the structure of a newly added document */
+    public static final int MAX_PARSED_ANCHORS = 1000;

    private final static ConcurrentLog log = new ConcurrentLog("WebStructureGraph");

@@ -207,7 +210,7 @@ public class WebStructureGraph {
        final HashSet<DigestURL> globalRefURLs = new HashSet<DigestURL>();
        final String refhost = url.getHost();
        DigestURL u;
-        int maxref = 1000;
+        int maxref = MAX_PARSED_ANCHORS;
        while ( it.hasNext() && maxref-- > 0 ) {
            u = it.next();
            if ( u == null ) {