improve OAI-PMH import identifier recognition

- find the best-fitting identifier (url) by checking all given dc:identifier values in a record (many entries provide several identifiers);
  as identifier is currently a multivalued field, use "getParams" in preference to splitting the 1st string by ";"
- add resolving of DOI:... identifiers via http://dx.doi.org/
pull/1/head
reger 11 years ago
parent 0e7d249a69
commit 5c9dcc269d

@ -55,7 +55,7 @@ public class IndexImportOAIPMHList_p {
prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0");
prop.put("source_table_" + count + "_count", count);
prop.put("source_table_" + count + "_source", CharacterCoding.unicode2html(root, true));
prop.put("source_table_" + count + "_loadurl", "<a href=\"/IndexImportOAIPMH_p.html?urlstart=" + CharacterCoding.unicode2html(root, true) + "\" target=\"_top\">" + CharacterCoding.unicode2html(root, true) + "</a>");
prop.put("source_table_" + count + "_loadurl", "<a href=\"IndexImportOAIPMH_p.html?urlstart=" + CharacterCoding.unicode2html(root, true) + "\" target=\"_top\">" + CharacterCoding.unicode2html(root, true) + "</a>");
dark = !dark;
count++;
}

@ -115,12 +115,22 @@ public class DCEntry extends MultiMapSolrParams {
}
}
/**
* get Identifier (url) (so far only used for surrogate processing)
* @param useRelationAsAlternative true = take relation if no identifier resolves to url
* @return
*/
public DigestURL getIdentifier(boolean useRelationAsAlternative) {
// identifier may be included multiple times (with all kinds of syntax - example is from one record)
// <dc:identifier>Astronomy and Astrophysics, 539, A99, 2012</dc:identifier>
// <dc:identifier>http://hdl.handle.net/2104/8302</dc:identifier>
// <dc:identifier>10.1051/0004-6361/201117940</dc:identifier>
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
if (u == null) return useRelationAsAlternative ? getRelation() : null;
String[] urls = u.split(";");
if (urls.length > 1) {
String[] urls = null;
if (u == null) urls = this.getParams("dc:identifier");
if (urls == null) return useRelationAsAlternative ? getRelation() : null;
// String[] urls = u.split(";"); // splitting may not succeed (see above)
if (urls.length > 0) { // check best also with 1 in case it's not http urn
// select one that fits
u = bestU(urls);
}
@ -154,16 +164,27 @@ public class DCEntry extends MultiMapSolrParams {
}
private static String bestU(String[] urls) {
if (urls.length > 1) { // with only one ... no choice
for (String uu: urls) {
if (uu.startsWith("http://") && (uu.endsWith(".html") || uu.endsWith(".htm") || uu.endsWith(".pdf") || uu.endsWith(".doc") || uu.endsWith(".rss") || uu.endsWith(".xml"))) return uu;
}
for (String uu: urls) {
if (uu.startsWith("http://")) return uu;
}
for (String uu: urls) {
if (uu.startsWith("https://")) return uu;
}
for (String uu: urls) {
if (uu.startsWith("ftp://")) return uu;
}
} // but check urn:/doi: resolve
for (String uu: urls) {
//doi identifier can be resolved through dx.doi.org:
// http://dx.doi.org/doi:10.12775/SIT.2010.010
//or http://dx.doi.org/10.12775/SIT.2010.010
if (uu.startsWith("DOI:") || uu.startsWith("doi:")) // saw it upper & lower-case
return "http://dx.doi.org/" + uu;
//urn identifiers can be resolved via the resolver of the d-nb:
//http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
if (uu.startsWith("urn:")) return "http://nbn-resolving.de/" + uu;

Loading…
Cancel
Save