Added JavaDoc to the getpageinfo_p API servlet.

pull/122/head
luccioman 8 years ago
parent c226ded799
commit cbccf97361

@ -52,10 +52,45 @@ import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
* Remote resource analyzer
*/
public class getpageinfo_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
/**
* <p>Scrape and parse a resource at a specified URL to provide some information, depending on the requested actions.</p>
*
* <p>
* Example API calls :
* <ul>
* <li>With the minimum required parameters : http://localhost:8090/api/getpageinfo_p.xml?url=http://yacy.net</li>
* <li>Only check the robots.txt policy and sitemap presence : http://localhost:8090/api/getpageinfo_p.xml?url=https://en.wikipedia.org/wiki/Main_Page&actions=robots</li>
* <li>Only check for an OAI Repository at CiteSeerX : http://localhost:8090/api/getpageinfo_p.xml?url=http://citeseerx.ist.psu.edu/oai2&actions=oai</li>
* </ul>
* </p>
*
*
* @param header
* servlet request header
* @param post
* request parameters. Supported keys :
* <ul>
* <li>url (required) : the URL of the resource to analyze. HTTP protocol is assumed if not present at the beginning of the URL.</li>
* <li>actions (optional) : a comma-separated list of actions to perform (defaults to "title,robots"). Supported actions :
* <ul>
* <li>title : look for the resource title, description, language, icons, keywords, and links</li>
* <li>robots : check whether crawling the resource is allowed by the robots.txt policy file, if any, and also whether this file exposes sitemap URLs.</li>
* <li>oai : send an "Identify" OAI-PMH request (http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify)
* at the URL to check for an OAI-PMH response from an Open Archives Initiative repository</li>
* </ul>
* </li>
* <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
* </ul>
* @param env
* server environment
* @return the servlet answer object
*/
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
@ -197,6 +232,10 @@ public class getpageinfo_p {
return prop;
}
/**
* @param url an OAI-PMH "Identify" request URL (http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify). Must not be null.
* @return the OAI Repository name or an empty String when the response could not be parsed as an OAI-PMH response
*/
private static String checkOAI(final String url) {
final DocumentBuilderFactory factory = DocumentBuilderFactory
.newInstance();
@ -214,6 +253,11 @@ public class getpageinfo_p {
return "";
}
/**
* Extract the OAI repository name from an OAI-PMH "Identify" response
* @param doc an XML document to parse. Must not be null.
* @return the repository name or an empty String when the XML document is not an OAI-PMH "Identify" response
*/
private static String parseXML(final Document doc) {
String repositoryName = null;

Loading…
Cancel
Save