From cef8ebc41da77caef39da8c71cc68715061a6012 Mon Sep 17 00:00:00 2001 From: cominch Date: Tue, 15 Nov 2011 12:22:19 +0000 Subject: [PATCH] getpageinfo: Checks if there is an OAI repository behind the URL. This check is only performed if oai parameter is set when calling e.g. getpageinfo_p.xml?actions=oai git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8039 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/api/getpageinfo_p.java | 76 +++++++++++++++++++++++++++++++++++ htroot/api/getpageinfo_p.xml | 1 + 2 files changed, 77 insertions(+) diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index 68c490807..ba1db4cdf 100755 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -14,6 +14,15 @@ import de.anomic.crawler.RobotsTxtEntry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + public class getpageinfo_p { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -30,6 +39,7 @@ public class getpageinfo_p { prop.put("favicon",""); prop.put("sitelist", ""); prop.put("filter", ".*"); + prop.put("oai", 0); // default actions String actions = "title,robots"; @@ -125,10 +135,76 @@ public class getpageinfo_p { Log.logException(e); } } + if (actions.indexOf("oai") >= 0) { + try { + final DigestURI theURL = new DigestURI(url + + "?verb=Identify"); + + String oairesult = checkOAI(theURL.toString()); + + prop.put("oai", oairesult.length() == 0 ? 
0 : 1); + + if (oairesult.length() > 0) { + prop.putXML("title", oairesult); + } + + } catch (final MalformedURLException e) { /* the Identify URL could not be built: "oai" keeps the default 0 that was put into prop earlier */ + } + } } // return rewrite properties return prop; } + + /** Parses the XML document found at url and returns the repository name extracted by parseXML. Returns "" when the parser cannot be configured or the document cannot be read or parsed; the exception is logged in that case. NOTE(review): the remote XML is parsed with default factory settings - consider disabling DTDs and external entities. */ private static String checkOAI(final String url) { + final DocumentBuilderFactory factory = DocumentBuilderFactory + .newInstance(); + try { + final DocumentBuilder builder = factory.newDocumentBuilder(); + return parseXML(builder.parse(url)); + } catch (final ParserConfigurationException ex) { + Log.logException(ex); + } catch (final SAXException ex) { + Log.logException(ex); + } catch (final IOException ex) { + Log.logException(ex); + } + + return ""; + } + + /** Returns the text of the repositoryName child of the Identify element(s) below the document root, or "" when there is no Identify element or it carries no repositoryName. Never returns null. */ private static String parseXML(final Document doc) { + + String repositoryName = null; + + final NodeList items = doc.getDocumentElement().getElementsByTagName( + "Identify"); + if (items.getLength() == 0) { + return ""; + } + + for (int i = 0, n = items.getLength(); i < n; ++i) { + + if (!"Identify".equals(items.item(i).getNodeName())) + continue; + + final NodeList currentNodeChildren = items.item(i).getChildNodes(); + + for (int j = 0, m = currentNodeChildren.getLength(); j < m; ++j) { + final Node currentNode = currentNodeChildren.item(j); + if ("repositoryName".equals(currentNode.getNodeName())) { + repositoryName = currentNode.getTextContent(); /* getTextContent yields "" for an empty element, where getFirstChild() would be null */ + } + } + + if (repositoryName == null) { + return ""; + } + + } + return repositoryName == null ? "" : repositoryName; + } + } diff --git a/htroot/api/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml index 84da4eb97..664e1972c 100644 --- a/htroot/api/getpageinfo_p.xml +++ b/htroot/api/getpageinfo_p.xml @@ -19,4 +19,5 @@ #{/links}# + #[oai]#