diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index f0286c027..dd4e42013 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -52,10 +52,45 @@ import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
-
+/**
+ * Remote resource analyzer
+ */
public class getpageinfo_p {
- public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
+ /**
+ *
+ * Scrape and parse a resource at a specified URL to provide some information, depending on the requested actions.
+ *
+ *
+ * Example API calls :
+ *
+ * - With the minimum required parameters : http://localhost:8090/api/getpageinfo_p.xml?url=http://yacy.net
+ * - Only check the robots.txt policy and sitemap presence : http://localhost:8090/api/getpageinfo_p.xml?url=https://en.wikipedia.org/wiki/Main_Page&actions=robots
+ * - Only check for an OAI Repository at CiteSeerX : http://localhost:8090/api/getpageinfo_p.xml?url=http://citeseerx.ist.psu.edu/oai2&actions=oai
+ *
+ *
+ *
+ *
+ * @param header
+ * servlet request header
+ * @param post
+ * request parameters. Supported keys :
+ *
+ * - url (required) : the URL of the resource to analyze. HTTP protocol is assumed if not present at the beginning of the URL.
+ * - actions (optional) : a list of comma separated actions to perform (defaults to "title,robots"). Supported actions :
+ *
+ * - title : look for the resource title, description, language, icons, keywords, and links
+ * - robots : check if crawling the resource is allowed by the robots.txt policy file, if any, and also if this file exposes sitemap URLs.
+ * - oai : send an "Identify" OAI-PMH request (http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify)
+ * at the URL to check for an OAI-PMH response from an Open Archives Initiative Repository
+ *
+ *
+ * - agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"
+ *
+ * @param env
+ * server environment
+ * @return the servlet answer object
+ */
+ public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
@@ -197,6 +232,10 @@ public class getpageinfo_p {
return prop;
}
+ /**
+ * @param url an OAI-PMH "Identify" request URL (http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify). Must not be null.
+ * @return the OAI Repository name or an empty String when the response could not be parsed as an OAI-PMH response
+ */
private static String checkOAI(final String url) {
final DocumentBuilderFactory factory = DocumentBuilderFactory
.newInstance();
@@ -214,6 +253,11 @@ public class getpageinfo_p {
return "";
}
+ /**
+ * Extract the OAI repository name from an OAI-PMH "Identify" response
+ * @param doc an XML document to parse. Must not be null.
+ * @return the repository name or an empty String when the XML document is not an OAI-PMH "Identify" response
+ */
private static String parseXML(final Document doc) {
String repositoryName = null;